diff --git a/contrib/hg/build-hgignore.pl b/contrib/hg/build-hgignore.pl index e2c07561e1..89c50b0163 100755 --- a/contrib/hg/build-hgignore.pl +++ b/contrib/hg/build-hgignore.pl @@ -32,6 +32,8 @@ my @globals = qw/.libs *.orig *.rej *.class +*.xcscheme +*.plist .git* .DS_Store stamp-h[1-9] diff --git a/contrib/platform/iu/odin/debug-nopmi b/contrib/platform/iu/odin/debug-nopmi new file mode 100644 index 0000000000..f146468200 --- /dev/null +++ b/contrib/platform/iu/odin/debug-nopmi @@ -0,0 +1,28 @@ +enable_opal_multi_threads=no +enable_dlopen=no +enable_pty_support=no +with_blcr=no +with_openib=no +with_memory_manager=no +enable_mem_debug=yes +enable_mem_profile=no +enable_debug_symbols=yes +enable_binaries=yes +with_devel_headers=yes +enable_heterogeneous=no +enable_picky=yes +enable_debug=yes +enable_shared=yes +enable_static=no +with_slurm=yes +with_pmi=no +enable_contrib_no_build=libnbc,vt +enable_visibility=yes +enable_memchecker=no +enable_ipv6=no +enable_mpi_f77=no +enable_mpi_f90=no +enable_mpi_cxx=no +enable_mpi_cxx_seek=no +enable_mca_no_build=pml-dr,pml-crcp2,crcp +enable_io_romio=no diff --git a/contrib/platform/iu/odin/debug-nopmi.conf b/contrib/platform/iu/odin/debug-nopmi.conf new file mode 100644 index 0000000000..2116035dff --- /dev/null +++ b/contrib/platform/iu/odin/debug-nopmi.conf @@ -0,0 +1,85 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is the default system-wide MCA parameters defaults file. +# Specifically, the MCA parameter "mca_param_files" defaults to a +# value of +# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf" +# (this file is the latter of the two). So if the default value of +# mca_param_files is not changed, this file is used to set system-wide +# MCA parameters. This file can therefore be used to set system-wide +# default MCA parameters for all users. Of course, users can override +# these values if they want, but this file is an excellent location +# for setting system-specific MCA parameters for those users who don't +# know / care enough to investigate the proper values for them. + +# Note that this file is only applicable where it is visible (in a +# filesystem sense). Specifically, MPI processes each read this file +# during their startup to determine what default values for MCA +# parameters should be used. mpirun does not bundle up the values in +# this file from the node where it was run and send them to all nodes; +# the default value decisions are effectively distributed. Hence, +# these values are only applicable on nodes that "see" this file. If +# $sysconf is a directory on a local disk, it is likely that changes +# to this file will need to be propagated to other nodes. If $sysconf +# is a directory that is shared via a networked filesystem, changes to +# this file will be visible to all nodes that share this $sysconf. + +# The format is straightforward: one per line, mca_param_name = +# rvalue. Quoting is ignored (so if you use quotes or escape +# characters, they'll be included as part of the value). For example: + +# Disable run-time MPI parameter checking +# mpi_param_check = 0 + +# Note that the value "~/" will be expanded to the current user's home +# directory. 
For example: + +# Change component loading path +# component_path = /usr/local/lib/openmpi:~/my_openmpi_components + +# See "ompi_info --param all all" for a full listing of Open MPI MCA +# parameters available and their default values. +# + +# Basic behavior to smooth startup +mca_component_show_load_errors = 0 +mpi_param_check = 0 +orte_abort_timeout = 10 +hwloc_base_mem_bind_failure_action = silent + +## Protect the shared file systems + +## Add the interface for out-of-band communication +## and set it up +oob_tcp_peer_retries = 120 +oob_tcp_disable_family = IPv6 +#oob_tcp_connect_timeout=600 + +## Define the MPI interconnects +btl = sm,tcp,self + +## Setup shared memory +btl_sm_free_list_max = 768 + +## Setup TCP +btl_tcp_if_include = ib0 + +## Configure the PML +pml_ob1_use_early_completion = 0 diff --git a/ompi/attribute/attribute.c b/ompi/attribute/attribute.c index ce8f19e918..9b733c2687 100644 --- a/ompi/attribute/attribute.c +++ b/ompi/attribute/attribute.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -195,7 +197,6 @@ #include "ompi/attribute/attribute.h" #include "opal/class/opal_bitmap.h" #include "opal/threads/mutex.h" -#include "opal/util/opal_sos.h" #include "ompi/constants.h" #include "ompi/datatype/ompi_datatype.h" @@ -1151,7 +1152,7 @@ static int get_value(opal_hash_table_t *attr_hash, int key, (void**) &keyval); OPAL_THREAD_UNLOCK(&keyval_hash_lock); - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { return MPI_KEYVAL_INVALID; } diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index bbd5a9fa9d..af9f69fdd8 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -14,6 +14,8 @@ * Copyright (c) 2007 Voltaire All rights reserved. * Copyright (c) 2006-2010 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. 
* $COPYRIGHT$ * @@ -32,7 +34,6 @@ #include "ompi/constants.h" #include "opal/class/opal_pointer_array.h" #include "opal/class/opal_list.h" -#include "opal/util/opal_sos.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/base.h" #include "ompi/request/request.h" @@ -145,7 +146,7 @@ int ompi_comm_cid_init (void) ompi_comm_world_thread_level_mult = 1; break; } - } else if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { if (ompi_mpi_thread_multiple) { ompi_comm_world_thread_level_mult = 1; } diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 5016980e34..fdd320b055 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -32,6 +32,9 @@ #ifdef HAVE_STRING_H #include #endif +#ifdef HAVE_LIMITS_H +#include +#endif #include "ompi/constants.h" #include "opal/class/opal_pointer_array.h" diff --git a/ompi/errhandler/errcode-internal.h b/ompi/errhandler/errcode-internal.h index f9d022209b..183f2ec65e 100644 --- a/ompi/errhandler/errcode-internal.h +++ b/ompi/errhandler/errcode-internal.h @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #include "ompi/constants.h" #include "opal/class/opal_object.h" #include "opal/class/opal_pointer_array.h" -#include "opal/util/opal_sos.h" #define OMPI_MAX_ERROR_STRING 64 @@ -51,18 +52,13 @@ OMPI_DECLSPEC extern opal_pointer_array_t ompi_errcodes_intern; OMPI_DECLSPEC extern int ompi_errcode_intern_lastused; /** - * Return the MPI errcode for a given internal error code. This - * function guarantees to return a non-OPAL_SOS-encoded error code. - */ + * Return the MPI errcode for a given internal error code. 
*/ static inline int ompi_errcode_get_mpi_code(int errcode) { int ret = MPI_ERR_UNKNOWN; int i; ompi_errcode_intern_t *errc; - /* Transmogrify, if necessary */ - errcode = OPAL_SOS_GET_ERROR_CODE(errcode); - /* If the errcode is >= 0, then it's already an MPI error code, so just return it. */ if (errcode >= 0) { diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 9314fd953d..fb621b1c3a 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,7 +33,6 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" -#include "opal/util/opal_sos.h" #include "ompi/mca/btl/btl.h" @@ -273,7 +274,7 @@ static inline int mca_bml_base_send( mca_bml_base_btl_t* bml_btl, des->des_context = (void*) bml_btl; rc = btl->btl_send(btl, bml_btl->btl_endpoint, des, tag); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_RESOURCE_BUSY) + if (rc == OMPI_ERR_RESOURCE_BUSY) rc = OMPI_SUCCESS; return rc; diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index ac5b85b874..b4d58762b1 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -404,7 +404,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, } if (mca_bml_r2.show_unreach_errors && - OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_UNREACH == ret) { orte_show_help("help-mca-bml-r2.txt", "unreachable proc", true, diff --git a/ompi/mca/bml/r2/bml_r2_ft.c b/ompi/mca/bml/r2/bml_r2_ft.c index 164dd4b841..72e73accc6 100644 --- a/ompi/mca/bml/r2/bml_r2_ft.c +++ b/ompi/mca/bml/r2/bml_r2_ft.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -25,12 +25,16 @@ #include #include +#include "opal/runtime/opal_progress.h" + +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/util/proc_info.h" + #include "ompi/runtime/ompi_cr.h" #include "ompi/mca/bml/base/base.h" #include "ompi/mca/btl/base/base.h" #include "ompi/mca/bml/base/bml_base_btl.h" #include "ompi/mca/pml/base/base.h" -#include "orte/mca/grpcomm/grpcomm.h" #include "ompi/proc/proc.h" #include "bml_r2.h" @@ -47,6 +51,7 @@ int mca_bml_r2_ft_event(int state) int loc_state; int param_type = -1; char *param_list = NULL; + orte_grpcomm_collective_t coll; if(OPAL_CRS_CHECKPOINT == state) { /* Do nothing for now */ @@ -153,10 +158,15 @@ int mca_bml_r2_ft_event(int state) * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. 
*/ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + coll.id = orte_process_info.peer_init_barrier; + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); return ret; } + while (coll.active) { + opal_progress(); + } /* * Re-open the BTL framework to get the full list of components. @@ -226,10 +236,15 @@ int mca_bml_r2_ft_event(int state) * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + coll.id = orte_process_info.peer_init_barrier; + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); return ret; } + while (coll.active) { + opal_progress(); + } /* * Re-open the BTL framework to get the full list of components. diff --git a/ompi/mca/btl/ofud/btl_ofud_component.c b/ompi/mca/btl/ofud/btl_ofud_component.c index a3e9659eee..ccbb63b63c 100644 --- a/ompi/mca/btl/ofud/btl_ofud_component.c +++ b/ompi/mca/btl/ofud/btl_ofud_component.c @@ -12,6 +12,8 @@ * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,7 +36,6 @@ #include "ompi/mca/btl/btl.h" #include "opal/mca/timer/base/base.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "orte/mca/errmgr/errmgr.h" #include "ompi/mca/btl/base/base.h" diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 7cdcf6e008..e39e6e44ec 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. @@ -34,7 +34,6 @@ #include "opal/class/opal_bitmap.h" #include "opal/util/output.h" #include "opal/util/arch.h" -#include "opal/util/opal_sos.h" #include "opal/include/opal_stdint.h" #include "ompi/mca/btl/btl.h" @@ -303,7 +302,7 @@ static int create_srq(mca_btl_openib_module_t *openib_btl) /* Check if our device supports modify srq ability */ rc = check_if_device_support_modify_srq(openib_btl); - if(OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_NOT_SUPPORTED == rc) { device_support_modify_srq = false; } else if(OMPI_SUCCESS != rc) { mca_btl_openib_show_init_error(__FILE__, __LINE__, @@ -494,7 +493,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl, endpoint->rem_info.rem_vendor_part_id, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { orte_show_help("help-mpi-btl-openib.txt", "error in device init", true, orte_process_info.nodename, @@ -1625,7 +1624,7 @@ int mca_btl_openib_put( 
mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; @@ -1696,7 +1695,7 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 8648b8482e..5180d7da46 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. 
@@ -1125,8 +1125,8 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) if (OMPI_SUCCESS != rc) { /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_openib_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -1161,8 +1161,8 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_openib_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -1658,11 +1658,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) device->ib_dev_attr.vendor_part_id, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious error, propagate it upwards */ goto error; } - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { /* If we didn't find a matching device in the INI files, output a warning that we're using default values (unless overridden that we don't want to see these warnings) */ @@ -1679,7 +1679,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) be set indicating that it does not have good values */ ret = ompi_btl_openib_ini_query(0, 0, &default_values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious 
error, propagate it upwards */ goto error; } @@ -1841,7 +1841,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) device, &mpool_resources); if(NULL == device->mpool){ /* Don't print an error message here -- we'll get one from - mpool_create anyway (OPAL_SOS would be good here...) */ + mpool_create anyway */ goto error; } @@ -1899,7 +1899,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) if (OMPI_SUCCESS != ret) { /* Out of bounds error indicates that we hit max btl number * don't propagate the error to the caller */ - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { ret = OMPI_SUCCESS; } break; @@ -2830,7 +2830,7 @@ btl_openib_component_init(int *num_btl_modules, /* If we get NOT_SUPPORTED, then no CPC was found for this port. But that's not a fatal error -- just keep going; let's see if we find any usable openib modules or not. */ - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_SUPPORTED == ret) { continue; } else if (OMPI_SUCCESS != ret) { /* All others *are* fatal. Note that we already did a @@ -2994,7 +2994,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep) error upward. 
*/ rc = mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag)); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc && - OMPI_ERR_RESOURCE_BUSY != OPAL_SOS_GET_ERROR_CODE(rc))) { + OMPI_ERR_RESOURCE_BUSY != rc)) { OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return rc; } @@ -3023,7 +3023,7 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } @@ -3036,7 +3036,7 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index abf980aaf4..eb0396f064 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. 
@@ -36,7 +36,6 @@ #include "opal_stdint.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" @@ -714,7 +713,7 @@ int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep, rc = mca_btl_openib_endpoint_post_send(ep, frag); } OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))) { + if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == rc)) { rc = OMPI_SUCCESS; } @@ -898,7 +897,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma( )); } rc = mca_btl_openib_endpoint_send(endpoint, frag); - if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; MCA_BTL_IB_FRAG_RETURN(frag); diff --git a/ompi/mca/btl/openib/btl_openib_failover.c b/ompi/mca/btl/openib/btl_openib_failover.c index 47c5ec5adc..17d78bf7e1 100644 --- a/ompi/mca/btl/openib/btl_openib_failover.c +++ b/ompi/mca/btl/openib/btl_openib_failover.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,8 +30,6 @@ #include "btl_openib_proc.h" #include "btl_openib_failover.h" -#include "opal/util/opal_sos.h" - static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep, struct mca_btl_base_module_t* module, bool errout); @@ -691,7 +691,7 @@ static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, ui BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr)); } rc = mca_btl_openib_endpoint_send(newep, frag); - if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) { return; } diff --git a/ompi/mca/btl/openib/btl_openib_ini.c b/ompi/mca/btl/openib/btl_openib_ini.c index b9d0a89df5..f13156d416 100644 --- a/ompi/mca/btl/openib/btl_openib_ini.c +++ b/ompi/mca/btl/openib/btl_openib_ini.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #endif #include "orte/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "btl_openib.h" @@ -133,13 +134,13 @@ int ompi_btl_openib_ini_init(void) /* Note that NOT_FOUND and SUCCESS are not fatal errors and we keep going. 
Other errors are treated as fatal */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { break; } str = colon + 1; } /* Parse the last file if we didn't have a fatal error above */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { ret = parse_file(str); } @@ -150,7 +151,7 @@ int ompi_btl_openib_ini_init(void) /* Return SUCCESS unless we got a fatal error */ initialized = true; - return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) ? + return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == ret) ? OMPI_SUCCESS : ret; } diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_base.c b/ompi/mca/btl/openib/connect/btl_openib_connect_base.c index db098f83cb..8caf672242 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_base.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_base.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -28,7 +30,6 @@ #include "orte/util/show_help.h" #include "opal/util/argv.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" /* * Array of all possible connection functions @@ -219,7 +220,7 @@ int ompi_btl_openib_connect_base_init(void) opal_output(-1, "found available cpc (SUCCESS init): %s", all[i]->cbc_name); continue; - } else if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + } else if (OMPI_ERR_NOT_SUPPORTED == rc) { continue; } else { return rc; @@ -265,8 +266,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t * strcat(msg, available[i]->cbc_name); rc = available[i]->cbc_query(btl, &cpcs[cpc_index]); - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_SUPPORTED == rc || OMPI_ERR_UNREACH == rc) { continue; } else if (OMPI_SUCCESS != rc) { free(cpcs); diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c index 8c54783495..0ca94cf030 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2011 Mellanox Technologies. All rights reserved. * Copyright (c) 2009-2011 IBM Corporation. All rights reserved. 
@@ -30,7 +30,6 @@ #include "orte/util/show_help.h" #include "opal/util/error.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index 2b57793b75..6eb59a7211 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -4,6 +4,8 @@ * Copyright (c) 2008 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -46,7 +48,6 @@ #include "opal/util/output.h" #include "opal/util/error.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" #include "btl_openib_fd.h" @@ -1932,7 +1933,7 @@ out3: out1: free(*cpc); out: - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_SUPPORTED == rc) { opal_output_verbose(5, mca_btl_base_output, "openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped", ibv_get_device_name(openib_btl->device->ib_dev), diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c index d964f6fe3c..7bc3eaf37d 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c @@ -5,6 +5,8 @@ * Copyright (c) 2010-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -21,7 +23,6 @@ #include "opal/util/output.h" #include "orte/util/show_help.h" #include "orte/util/name_fns.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" @@ -698,10 +699,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces BTL_VERBOSE(("Searching for ep and proc with follow parameters:" "jobid %d, vpid %d, " - "epoch %d, " "sid %" PRIx64 ", lid %d", process_name->jobid, process_name->vpid, - ORTE_EPOCH_GET(process_name), subnet_id, lid)); diff --git a/ompi/mca/btl/portals/btl_portals.c b/ompi/mca/btl/portals/btl_portals.c index d970f3ac41..d756b45f32 100644 --- a/ompi/mca/btl/portals/btl_portals.c +++ b/ompi/mca/btl/portals/btl_portals.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #include "ompi/constants.h" #include "ompi/mca/btl/btl.h" #include "opal/datatype/opal_convertor.h" -#include "opal/util/opal_sos.h" #include "btl_portals.h" #include "btl_portals_endpoint.h" diff --git a/ompi/mca/btl/portals/btl_portals_frag.h b/ompi/mca/btl/portals/btl_portals_frag.h index 797884048d..b4fe43af94 100644 --- a/ompi/mca/btl/portals/btl_portals_frag.h +++ b/ompi/mca/btl/portals/btl_portals_frag.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -67,7 +69,7 @@ OBJ_CLASS_DECLARATION(mca_btl_portals_frag_recv_t); ompi_free_list_item_t *item; \ OMPI_FREE_LIST_GET(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \ frag = (mca_btl_portals_frag_t*) item; \ - if (OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \ + if (rc == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \ OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc); \ } \ } diff --git a/ompi/mca/btl/portals/btl_portals_send.c b/ompi/mca/btl/portals/btl_portals_send.c index 819c8f17d8..c09fb915c4 100644 --- a/ompi/mca/btl/portals/btl_portals_send.c +++ b/ompi/mca/btl/portals/btl_portals_send.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -24,7 +26,6 @@ #include "ompi/constants.h" #include "opal/datatype/opal_convertor.h" -#include "opal/util/opal_sos.h" #include "btl_portals.h" #include "btl_portals_send.h" diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c index bd30e74eb0..8b1866ba96 100644 --- a/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -12,6 +12,8 @@ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Laboratory + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -53,7 +55,6 @@ #include "opal/util/output.h" #include "opal/util/argv.h" #include "opal/util/net.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "orte/types.h" @@ -1055,7 +1056,7 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules, } #if OPAL_WANT_IPV6 if((ret = mca_btl_tcp_component_create_listen(AF_INET6)) != OMPI_SUCCESS) { - if (!(OMPI_ERR_IN_ERRNO == OPAL_SOS_GET_ERROR_CODE(ret) && + if (!(OMPI_ERR_IN_ERRNO == ret && EAFNOSUPPORT == opal_socket_errno)) { opal_output (0, "mca_btl_tcp_component: IPv6 listening socket failed\n"); return 0; diff --git a/ompi/mca/btl/ugni/btl_ugni_put.c b/ompi/mca/btl/ugni/btl_ugni_put.c index 4e38194892..1f67a9e291 100644 --- a/ompi/mca/btl/ugni/btl_ugni_put.c +++ b/ompi/mca/btl/ugni/btl_ugni_put.c @@ -13,7 +13,6 @@ #include "opal/include/opal_stdint.h" #include "btl_ugni_rdma.h" -#include "opal/util/opal_sos.h" /** * Initiate a put operation. diff --git a/ompi/mca/btl/wv/btl_wv.c b/ompi/mca/btl/wv/btl_wv.c index b03d312815..197a72b64c 100644 --- a/ompi/mca/btl/wv/btl_wv.c +++ b/ompi/mca/btl/wv/btl_wv.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved. 
@@ -32,7 +32,6 @@ #include "opal/class/opal_bitmap.h" #include "opal/util/output.h" #include "opal/util/arch.h" -#include "opal/util/opal_sos.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/base/btl_base_error.h" @@ -309,7 +308,7 @@ static int mca_btl_wv_tune_endpoint(mca_btl_wv_module_t* wv_btl, endpoint->rem_info.rem_vendor_part_id, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { orte_show_help("help-mpi-btl-wv.txt", "error in device init", true, orte_process_info.nodename, @@ -1347,7 +1346,7 @@ int mca_btl_wv_put(mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; @@ -1406,7 +1405,7 @@ int mca_btl_wv_get(mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; if(OMPI_SUCCESS != rc) return rc; diff --git a/ompi/mca/btl/wv/btl_wv_component.c b/ompi/mca/btl/wv/btl_wv_component.c index e7d47c0409..12dc8c8e25 100644 --- a/ompi/mca/btl/wv/btl_wv_component.c +++ b/ompi/mca/btl/wv/btl_wv_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
@@ -784,8 +784,8 @@ static int prepare_device_for_use(mca_btl_wv_device_t *device) if (OMPI_SUCCESS != rc) { /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_wv_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -820,8 +820,8 @@ static int prepare_device_for_use(mca_btl_wv_device_t *device) /* If we're "out of memory", this usually means that we ran out of registered memory, so show that error message */ - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { errno = ENOMEM; mca_btl_wv_show_init_error(__FILE__, __LINE__, "ompi_free_list_init_ex_new", @@ -1312,11 +1312,11 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) device->ib_dev_attr.VendorPartId, &values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious error, propagate it upwards */ goto error; } - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { /* If we didn't find a matching device in the INI files, output a warning that we're using default values (unless overridden that we don't want to see these warnings) */ @@ -1333,7 +1333,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) be set indicating that it does not have good values */ ret = ompi_btl_wv_ini_query(0, 0, &default_values); if (OMPI_SUCCESS != ret && - OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) { + OMPI_ERR_NOT_FOUND != ret) { /* If we get a serious error, propagate it upwards */ 
goto error; } @@ -1429,7 +1429,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) device, &mpool_resources); if(NULL == device->mpool){ /* Don't print an error message here -- we'll get one from - mpool_create anyway (OPAL_SOS would be good here...) */ + mpool_create anyway */ goto error; } @@ -1481,7 +1481,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev) if (OMPI_SUCCESS != ret) { /* Out of bounds error indicates that we hit max btl number * don't propagate the error to the caller */ - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { ret = OMPI_SUCCESS; } break; @@ -2313,7 +2313,7 @@ btl_wv_component_init(int *num_btl_modules, /* If we get NOT_SUPPORTED, then no CPC was found for this port. But that's not a fatal error -- just keep going; let's see if we find any usable wv modules or not. */ - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_SUPPORTED == ret) { continue; } else if (OMPI_SUCCESS != ret) { /* All others *are* fatal. Note that we already did a @@ -2469,7 +2469,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep) error upward. 
*/ rc = mca_btl_wv_endpoint_post_send(ep, to_send_frag(frag)); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc && - OMPI_ERR_RESOURCE_BUSY != OPAL_SOS_GET_ERROR_CODE(rc))) { + OMPI_ERR_RESOURCE_BUSY != rc)) { OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return rc; } @@ -2497,7 +2497,7 @@ void mca_btl_wv_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_wv_get((mca_btl_base_module_t *)wv_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } @@ -2510,7 +2510,7 @@ void mca_btl_wv_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, break; rc = mca_btl_wv_put((mca_btl_base_module_t *)wv_btl, ep, &to_base_frag(frag)->base); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/btl/wv/btl_wv_endpoint.c b/ompi/mca/btl/wv/btl_wv_endpoint.c index 6319238438..2a37877e50 100644 --- a/ompi/mca/btl/wv/btl_wv_endpoint.c +++ b/ompi/mca/btl/wv/btl_wv_endpoint.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. 
@@ -33,7 +33,6 @@ #include "opal_stdint.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" @@ -617,7 +616,7 @@ int mca_btl_wv_endpoint_send(mca_btl_base_endpoint_t* ep, rc = mca_btl_wv_endpoint_post_send(ep, frag); } OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))) { + if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == rc)) { rc = OMPI_SUCCESS; } @@ -801,7 +800,7 @@ static int mca_btl_wv_endpoint_send_eager_rdma( )); } rc = mca_btl_wv_endpoint_send(endpoint, frag); - if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) + if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) return OMPI_SUCCESS; MCA_BTL_IB_FRAG_RETURN(frag); diff --git a/ompi/mca/btl/wv/btl_wv_ini.c b/ompi/mca/btl/wv/btl_wv_ini.c index 8ee2f171e7..dd593e9913 100644 --- a/ompi/mca/btl/wv/btl_wv_ini.c +++ b/ompi/mca/btl/wv/btl_wv_ini.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #endif #include "orte/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "btl_wv.h" @@ -127,13 +128,13 @@ int ompi_btl_wv_ini_init(void) /* Note that NOT_FOUND and SUCCESS are not fatal errors and we keep going. 
Other errors are treated as fatal */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { break; } str = colon + 1; } /* Parse the last file if we didn't have a fatal error above */ - if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) { + if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) { ret = parse_file(str); } @@ -144,7 +145,7 @@ int ompi_btl_wv_ini_init(void) /* Return SUCCESS unless we got a fatal error */ initialized = true; - return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) ? + return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == ret) ? OMPI_SUCCESS : ret; } diff --git a/ompi/mca/btl/wv/connect/btl_wv_connect_base.c b/ompi/mca/btl/wv/connect/btl_wv_connect_base.c index 7c09c28c9b..2030144d7c 100644 --- a/ompi/mca/btl/wv/connect/btl_wv_connect_base.c +++ b/ompi/mca/btl/wv/connect/btl_wv_connect_base.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -19,7 +21,6 @@ #include "orte/util/show_help.h" #include "opal/util/argv.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" /* * Array of all possible connection functions @@ -183,7 +184,7 @@ int ompi_btl_wv_connect_base_init(void) opal_output(-1, "found available cpc (SUCCESS init): %s", all[i]->cbc_name); continue; - } else if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) { + } else if (OMPI_ERR_NOT_SUPPORTED == rc) { continue; } else { return rc; @@ -229,8 +230,8 @@ int ompi_btl_wv_connect_base_select_for_local_port(mca_btl_wv_module_t *btl) strcat(msg, available[i]->cbc_name); rc = available[i]->cbc_query(btl, &cpcs[cpc_index]); - if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc) || - OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_SUPPORTED == rc || + OMPI_ERR_UNREACH == rc) { continue; } else if (OMPI_SUCCESS != rc) { free(cpcs); diff --git a/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c b/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c index 39ed2e2b69..07e2ad1e35 100644 --- a/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c +++ b/ompi/mca/btl/wv/connect/btl_wv_connect_oob.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. 
@@ -28,7 +28,6 @@ #include "orte/util/show_help.h" #include "opal/util/error.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c index 33649ae2d4..e8bea87cc4 100644 --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c @@ -702,7 +702,6 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t, void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) { peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; peer_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t); OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t); @@ -730,7 +729,6 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer peer_ref->proc_name.jobid = ORTE_JOBID_INVALID; peer_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN); while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) { HOKE_TRAFFIC_MSG_REF_RETURN(item); @@ -840,7 +838,6 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->matched = INVALID_INT; msg_ref->done = INVALID_INT; @@ -868,7 +865,6 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->matched = INVALID_INT; msg_ref->done = INVALID_INT; @@ -902,7 +898,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m msg_ref->proc_name.jobid = 
ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->done = INVALID_INT; msg_ref->active = INVALID_INT; @@ -934,7 +929,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m msg_ref->proc_name.jobid = ORTE_JOBID_INVALID; msg_ref->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN); msg_ref->done = INVALID_INT; msg_ref->active = INVALID_INT; @@ -954,7 +948,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); } void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) { @@ -962,7 +955,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID; msg_ack_ref->peer.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN); } @@ -1034,7 +1026,6 @@ do { \ \ msg_ref->proc_name.jobid = p_jobid; \ msg_ref->proc_name.vpid = p_vpid; \ - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \ \ msg_ref->matched = 0; \ msg_ref->done = 0; \ @@ -1063,7 +1054,6 @@ do { \ \ msg_ref->proc_name.jobid = p_jobid; \ msg_ref->proc_name.vpid = p_vpid; \ - ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \ } @@ -1466,7 +1456,6 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs( new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid; new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid; - ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch); opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super)); } @@ -3375,7 +3364,6 @@ static int 
traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m if( NULL == from_peer_ref && NULL != to_peer_ref ) { (*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid; (*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid; - ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch); } return exit_status; @@ -5281,7 +5269,6 @@ static int send_bookmarks(int peer_idx) */ peer_name.jobid = ORTE_PROC_MY_NAME->jobid; peer_name.vpid = peer_idx; - ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); if( NULL == (peer_ref = find_peer(peer_name))) { opal_output(mca_crcp_bkmrk_component.super.output_handle, @@ -5342,7 +5329,6 @@ static int recv_bookmarks(int peer_idx) peer_name.jobid = ORTE_PROC_MY_NAME->jobid; peer_name.vpid = peer_idx; - ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name)); if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name, OMPI_CRCP_COORD_BOOKMARK_TAG, @@ -5524,7 +5510,6 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref, HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret); d_msg_ack->peer.jobid = peer_ref->proc_name.jobid; d_msg_ack->peer.vpid = peer_ref->proc_name.vpid; - ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch); d_msg_ack->complete = false; opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super)); diff --git a/ompi/mca/dpm/base/dpm_base_select.c b/ompi/mca/dpm/base/dpm_base_select.c index f22f9d0ec6..09764183cd 100644 --- a/ompi/mca/dpm/base/dpm_base_select.c +++ b/ompi/mca/dpm/base/dpm_base_select.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +20,6 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_component_repository.h" @@ -41,7 +42,7 @@ int ompi_dpm_base_select(void) (mca_base_module_t **) &best_module, (mca_base_component_t **) &best_component))) { /* it is okay not to find any executable components */ - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { ret = OPAL_SUCCESS; } goto cleanup; diff --git a/ompi/mca/dpm/orte/dpm_orte.c b/ompi/mca/dpm/orte/dpm_orte.c index 71d54fd15f..613d73c611 100644 --- a/ompi/mca/dpm/orte/dpm_orte.c +++ b/ompi/mca/dpm/orte/dpm_orte.c @@ -12,6 +12,8 @@ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,7 +30,6 @@ #include "opal/util/argv.h" #include "opal/util/opal_getcwd.h" -#include "opal/util/opal_sos.h" #include "opal/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" @@ -65,7 +66,6 @@ static orte_process_name_t carport; static void recv_cb(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); -static void process_cb(int fd, short event, void *data); /* API functions */ static int init(void); @@ -104,6 +104,13 @@ ompi_dpm_base_module_t ompi_dpm_orte_module = { finalize }; +static void rml_cbfunc(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + OBJ_RELEASE(buffer); +} + /* * Init the module @@ -136,7 +143,11 @@ static int connect_accept ( ompi_communicator_t *comm, int root, int i,j, new_proc_len; ompi_group_t *new_group_pointer; - + orte_grpcomm_coll_id_t id; + orte_grpcomm_collective_t modex; + opal_list_item_t *item; + orte_namelist_t *nm; + OPAL_OUTPUT_VERBOSE((1, ompi_dpm_base_output, "%s dpm:orte:connect_accept with port %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -179,6 +190,65 @@ static int connect_accept ( ompi_communicator_t *comm, int root, opal_progress_event_users_increment(); if ( rank == root ) { + if (send_first) { + /* Get a collective id for the modex we need later on - we + * have to get a globally unique id for this purpose as + * multiple threads can do simultaneous connect/accept, + * and the same processes can be engaged in multiple + * connect/accepts at the same time. 
Only one side + * needs to do this, so have it be send_first + */ + nbuf = OBJ_NEW(opal_buffer_t); + if (NULL == nbuf) { + return OMPI_ERROR; + } + /* send the request - doesn't have to include any data */ + rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, nbuf, ORTE_RML_TAG_COLL_ID_REQ, 0, rml_cbfunc, NULL); + /* wait for the id */ + recv_completed = false; + cabuf = OBJ_NEW(opal_buffer_t); + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID, + ORTE_RML_NON_PERSISTENT, recv_cb, NULL); + /* wait for response */ + while (!recv_completed) { + opal_progress(); + } + i=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cabuf); + return OMPI_ERROR; + } + OBJ_RELEASE(cabuf); + /* send it to my peer on the other side */ + nbuf = OBJ_NEW(opal_buffer_t); + if (NULL == nbuf) { + return OMPI_ERROR; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(nbuf, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + goto exit; + } + rc = orte_rml.send_buffer_nb(&port, nbuf, tag, 0, rml_cbfunc, NULL); + } else { + /* wait to recv the collective id */ + recv_completed = false; + cabuf = OBJ_NEW(opal_buffer_t); + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag, + ORTE_RML_NON_PERSISTENT, recv_cb, NULL); + /* wait for response */ + while (!recv_completed) { + opal_progress(); + } + i=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cabuf); + return OMPI_ERROR; + } + OBJ_RELEASE(cabuf); + } + /* Generate the message buffer containing the number of processes and the list of participating processes */ nbuf = OBJ_NEW(opal_buffer_t); @@ -186,6 +256,12 @@ static int connect_accept ( ompi_communicator_t *comm, int root, return OMPI_ERROR; } + /* pass the collective id so we can all use it */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(nbuf, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + goto exit; + } + if 
(OPAL_SUCCESS != (rc = opal_dss.pack(nbuf, &size, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); goto exit; @@ -244,7 +320,9 @@ static int connect_accept ( ompi_communicator_t *comm, int root, rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag, ORTE_RML_NON_PERSISTENT, recv_cb, NULL); /* wait for response */ - ORTE_PROGRESSED_WAIT(recv_completed, 0, 1); + while (!recv_completed) { + opal_progress(); + } OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept got data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -259,7 +337,9 @@ static int connect_accept ( ompi_communicator_t *comm, int root, rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag, ORTE_RML_NON_PERSISTENT, recv_cb, NULL); /* wait for response */ - ORTE_PROGRESSED_WAIT(recv_completed, 0, 1); + while (!recv_completed) { + opal_progress(); + } /* now send our info */ OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept sending info to %s", @@ -324,6 +404,13 @@ static int connect_accept ( ompi_communicator_t *comm, int root, goto exit; } + /* unload the collective id */ + num_vals = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(nrbuf, &id, &num_vals, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + goto exit; + } + num_vals = 1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(nrbuf, &rsize, &num_vals, OPAL_INT))) { ORTE_ERROR_LOG(rc); @@ -360,7 +447,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < rsize ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = rprocs[i]->proc_name; - opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept send first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -369,7 +456,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < group->grp_proc_count ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = ompi_group_peer_lookup(group, i)->proc_name; 
- opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept send first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -380,7 +467,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < group->grp_proc_count ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = ompi_group_peer_lookup(group, i)->proc_name; - opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept recv first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -389,7 +476,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root, for (i = 0 ; i < rsize ; ++i) { name = OBJ_NEW(orte_namelist_t); name->name = rprocs[i]->proc_name; - opal_list_append(&all_procs, &name->item); + opal_list_append(&all_procs, &name->super); OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output, "%s dpm:orte:connect_accept recv first adding %s to allgather list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -402,10 +489,28 @@ static int connect_accept ( ompi_communicator_t *comm, int root, "%s dpm:orte:connect_accept executing modex", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = orte_grpcomm.modex(&all_procs))) { + /* setup the modex */ + OBJ_CONSTRUCT(&modex, orte_grpcomm_collective_t); + modex.id = id; + /* copy across the list of participants */ + for (item = opal_list_get_first(&all_procs); + item != opal_list_get_end(&all_procs); + item = opal_list_get_next(item)) { + nm = (orte_namelist_t*)item; + name = OBJ_NEW(orte_namelist_t); + name->name = nm->name; + opal_list_append(&modex.participants, &name->super); + } + + /* perform it */ + if (OMPI_SUCCESS != (rc = orte_grpcomm.modex(&modex))) { ORTE_ERROR_LOG(rc); goto exit; } + while (modex.active) { + opal_progress(); + } + OBJ_DESTRUCT(&modex); OPAL_OUTPUT_VERBOSE((3, 
ompi_dpm_base_output, "%s dpm:orte:connect_accept modex complete", @@ -1521,33 +1626,12 @@ static void recv_cb(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release when processed - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_cb); - - -} -static void process_cb(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - /* copy the payload to the global buffer */ - opal_dss.copy_payload(cabuf, mev->buffer); + opal_dss.copy_payload(cabuf, buffer); /* flag the identity of the remote proc */ - carport.jobid = mev->sender.jobid; - carport.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch); - - /* release the event */ - OBJ_RELEASE(mev); + carport.jobid = sender->jobid; + carport.vpid = sender->vpid; /* flag complete */ recv_completed = true; diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c index da77c1cf84..92345fa2a6 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -24,7 +26,6 @@ #include "mpi.h" #include "opal/runtime/opal_progress.h" #include "opal/threads/mutex.h" -#include "opal/util/opal_sos.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/osc/base/base.h" @@ -122,7 +123,7 @@ ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win) ret = ompi_osc_pt2pt_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); @@ -267,7 +268,7 @@ ompi_osc_pt2pt_module_complete(ompi_win_t *win) ret = ompi_osc_pt2pt_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); @@ -490,7 +491,7 @@ ompi_osc_pt2pt_module_unlock(int target, ret = ompi_osc_pt2pt_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 75d9bc821c..3642295cc4 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -7,7 +7,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -137,7 +137,7 @@ ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count, ret = ompi_osc_rdma_sendreq_send(module, sendreq); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { OPAL_THREAD_LOCK(&module->m_lock); sendreq->req_module->m_num_pending_out -= 1; opal_list_append(&(module->m_pending_sendreqs), @@ -209,7 +209,7 @@ ompi_osc_rdma_module_get(void *origin_addr, ret = ompi_osc_rdma_sendreq_send(module, sendreq); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { OPAL_THREAD_LOCK(&module->m_lock); sendreq->req_module->m_num_pending_out -= 1; opal_list_append(&(module->m_pending_sendreqs), @@ -278,7 +278,7 @@ ompi_osc_rdma_module_put(void *origin_addr, int origin_count, ret = ompi_osc_rdma_sendreq_send(module, sendreq); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { OPAL_THREAD_LOCK(&module->m_lock); sendreq->req_module->m_num_pending_out -= 1; opal_list_append(&(module->m_pending_sendreqs), diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.c b/ompi/mca/osc/rdma/osc_rdma_sync.c index e792299eb0..12b3c0e00c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_sync.c +++ b/ompi/mca/osc/rdma/osc_rdma_sync.c @@ -7,7 +7,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ @@ -140,7 +140,7 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) opal_list_remove_first(&(module->m_copy_pending_sendreqs)); ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_list_append(&(module->m_copy_pending_sendreqs), (opal_list_item_t*)req); } else if (OMPI_SUCCESS != ret) { return ret; @@ -355,7 +355,7 @@ ompi_osc_rdma_module_complete(ompi_win_t *win) (ompi_osc_rdma_sendreq_t*) item; ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_list_append(&(module->m_copy_pending_sendreqs), item); break; } else if (OMPI_SUCCESS != ret) { @@ -589,7 +589,7 @@ ompi_osc_rdma_module_unlock(int target, (ompi_osc_rdma_sendreq_t*) item; ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { opal_list_append(&(module->m_copy_pending_sendreqs), item); break; } else if (OMPI_SUCCESS != ret) { diff --git a/ompi/mca/pml/base/pml_base_select.c b/ompi/mca/pml/base/pml_base_select.c index 1906ba7085..fd00dcb756 100644 --- a/ompi/mca/pml/base/pml_base_select.c +++ b/ompi/mca/pml/base/pml_base_select.c @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,7 +27,6 @@ #include "opal/class/opal_list.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "orte/util/show_help.h" #include "opal/runtime/opal_progress.h" #include "opal/mca/mca.h" @@ -354,7 +355,7 @@ mca_pml_base_pml_check_selected(const char *my_pml, (void**) &remote_pml, &size); /* if modex isn't implemented, then just assume all is well... */ - if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_IMPLEMENTED == ret) { opal_output_verbose( 10, mca_pml_base_output, "check:select: modex not implemented"); return OMPI_SUCCESS; diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c index 6eb5b7c755..879d4b5047 100644 --- a/ompi/mca/pml/bfo/pml_bfo.c +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -14,6 +14,8 @@ * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -587,7 +589,7 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.pckt_pending, (opal_list_item_t*)pckt); @@ -608,7 +610,7 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl) #else /* PML_BFO */ pckt->hdr.hdr_fin.hdr_fail); #endif /* PML_BFO */ - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } break; @@ -640,7 +642,7 @@ void mca_pml_bfo_process_pending_rdma(void) } else { rc = mca_pml_bfo_recv_request_get_frag(frag); } - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.c b/ompi/mca/pml/bfo/pml_bfo_failover.c index ede5c8bf4f..fee412f9db 100644 --- a/ompi/mca/pml/bfo/pml_bfo_failover.c +++ b/ompi/mca/pml/bfo/pml_bfo_failover.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -839,7 +841,7 @@ void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq, /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); - if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc))) + if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc)) return; } add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); @@ -897,7 +899,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des) rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); if (OMPI_SUCCESS == rc) { return; - } else if (OMPI_ERR_OUT_OF_RESOURCE == (OPAL_SOS_GET_ERROR_CODE(rc))) { + } else if (OMPI_ERR_OUT_OF_RESOURCE == rc) { opal_output_verbose(30, mca_pml_bfo_output, "Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d", (int)sendreq->req_endpoint->btl_eager.arr_size); diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.c b/ompi/mca/pml/bfo/pml_bfo_recvreq.c index f032ade8bd..033f460a0e 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvreq.c +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -56,7 +58,7 @@ void mca_pml_bfo_recv_request_process_pending(void) break; recvreq->req_pending = false; rc = mca_pml_bfo_recv_request_schedule_exclusive(recvreq, NULL); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -433,7 +435,7 @@ int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag ) /* queue up get request */ rc = mca_bml_base_get(bml_btl,descriptor); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { mca_bml_base_free(bml_btl, descriptor); OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.rdma_pending, diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.h b/ompi/mca/pml/bfo/pml_bfo_recvreq.h index 69d12b0434..be9e1c441e 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvreq.h +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -369,7 +371,7 @@ static inline int mca_pml_bfo_recv_request_schedule_exclusive( do { rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_recv_request(req)); diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.c b/ompi/mca/pml/bfo/pml_bfo_sendreq.c index 63549a57d8..547514b971 100644 --- a/ompi/mca/pml/bfo/pml_bfo_sendreq.c +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -57,7 +59,7 @@ void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl) switch(pending_type) { case MCA_PML_BFO_SEND_PENDING_SCHEDULE: rc = mca_pml_bfo_send_request_schedule_exclusive(sendreq); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { return; } break; @@ -70,7 +72,7 @@ void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl) MCA_PML_BFO_SEND_PENDING_START, true); } else { rc = mca_pml_bfo_send_request_start_btl(sendreq, send_dst); - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { /* No more resources on this btl so prepend to the pending * list to minimize reordering and give up for now. */ add_request_to_send_pending(sendreq, @@ -618,8 +620,7 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq, } return OMPI_SUCCESS; } - - if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_RESOURCE_BUSY == rc) { /* No more resources. Allow the upper level to queue the send */ rc = OMPI_ERR_OUT_OF_RESOURCE; } @@ -1311,7 +1312,7 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag ) if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_bml_base_free(bml_btl, des); frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.h b/ompi/mca/pml/bfo/pml_bfo_sendreq.h index 52d8b896b7..3ef8900458 100644 --- a/ompi/mca/pml/bfo/pml_bfo_sendreq.h +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. 
* All rights reserved. * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -293,7 +295,7 @@ mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq) int rc; do { rc = mca_pml_bfo_send_request_schedule_once(sendreq); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_send_request(sendreq)); @@ -458,7 +460,7 @@ mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq ) /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); - if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) return rc; } add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); diff --git a/ompi/mca/pml/csum/pml_csum.c b/ompi/mca/pml/csum/pml_csum.c index 7935ce3971..4ae381e398 100644 --- a/ompi/mca/pml/csum/pml_csum.c +++ b/ompi/mca/pml/csum/pml_csum.c @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
@@ -586,7 +586,7 @@ void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_csum.lock); opal_list_append(&mca_pml_csum.pckt_pending, (opal_list_item_t*)pckt); @@ -599,7 +599,7 @@ void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_fin.hdr_des, pckt->order, pckt->hdr.hdr_fin.hdr_fail); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } break; @@ -631,7 +631,7 @@ void mca_pml_csum_process_pending_rdma(void) } else { rc = mca_pml_csum_recv_request_get_frag(frag); } - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/pml/csum/pml_csum_recvreq.c b/ompi/mca/pml/csum/pml_csum_recvreq.c index e0bc7e8d89..1ab808b963 100644 --- a/ompi/mca/pml/csum/pml_csum_recvreq.c +++ b/ompi/mca/pml/csum/pml_csum_recvreq.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
@@ -83,7 +83,7 @@ void mca_pml_csum_recv_request_process_pending(void) break; recvreq->req_pending = false; rc = mca_pml_csum_recv_request_schedule_exclusive(recvreq, NULL); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -425,7 +425,7 @@ int mca_pml_csum_recv_request_get_frag( mca_pml_csum_rdma_frag_t* frag ) /* queue up get request */ rc = mca_bml_base_get(bml_btl,descriptor); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { mca_bml_base_free(bml_btl, descriptor); OPAL_THREAD_LOCK(&mca_pml_csum.lock); opal_list_append(&mca_pml_csum.rdma_pending, diff --git a/ompi/mca/pml/csum/pml_csum_recvreq.h b/ompi/mca/pml/csum/pml_csum_recvreq.h index b62d96a116..3245b82c2e 100644 --- a/ompi/mca/pml/csum/pml_csum_recvreq.h +++ b/ompi/mca/pml/csum/pml_csum_recvreq.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -346,7 +348,7 @@ static inline int mca_pml_csum_recv_request_schedule_exclusive( do { rc = mca_pml_csum_recv_request_schedule_once(req, start_bml_btl); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_recv_request(req)); diff --git a/ompi/mca/pml/csum/pml_csum_sendreq.c b/ompi/mca/pml/csum/pml_csum_sendreq.c index aac1a2c462..758181eff6 100644 --- a/ompi/mca/pml/csum/pml_csum_sendreq.c +++ b/ompi/mca/pml/csum/pml_csum_sendreq.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. 
All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * $COPYRIGHT$ @@ -66,7 +66,7 @@ void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl) switch(pending_type) { case MCA_PML_CSUM_SEND_PENDING_SCHEDULE: rc = mca_pml_csum_send_request_schedule_exclusive(sendreq); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { return; } break; @@ -79,7 +79,7 @@ void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl) MCA_PML_CSUM_SEND_PENDING_START, true); } else { rc = mca_pml_csum_send_request_start_btl(sendreq, send_dst); - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { /* No more resources on this btl so prepend to the pending * list to minimize reordering and give up for now. */ add_request_to_send_pending(sendreq, @@ -590,7 +590,7 @@ int mca_pml_csum_send_request_start_copy( mca_pml_csum_send_request_t* sendreq, } return OMPI_SUCCESS; } - switch(OPAL_SOS_GET_ERROR_CODE(rc)) { + switch(rc) { case OMPI_ERR_RESOURCE_BUSY: /* No more resources. 
Allow the upper level to queue the send */ rc = OMPI_ERR_OUT_OF_RESOURCE; @@ -1256,7 +1256,7 @@ int mca_pml_csum_send_request_put_frag( mca_pml_csum_rdma_frag_t* frag ) if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_bml_base_free(bml_btl, des); frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_csum.lock); opal_list_append(&mca_pml_csum.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_csum.lock); diff --git a/ompi/mca/pml/csum/pml_csum_sendreq.h b/ompi/mca/pml/csum/pml_csum_sendreq.h index bdaeb7e841..f2e84840f7 100644 --- a/ompi/mca/pml/csum/pml_csum_sendreq.h +++ b/ompi/mca/pml/csum/pml_csum_sendreq.h @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2009 Los Alamos National Security, LLC. All rights + * Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
* $COPYRIGHT$ @@ -287,7 +287,7 @@ mca_pml_csum_send_request_schedule_exclusive(mca_pml_csum_send_request_t* sendre int rc; do { rc = mca_pml_csum_send_request_schedule_once(sendreq); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_send_request(sendreq)); @@ -434,7 +434,7 @@ mca_pml_csum_send_request_start( mca_pml_csum_send_request_t* sendreq ) /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl); - if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) return rc; } add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true); diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.c b/ompi/mca/pml/dr/pml_dr_sendreq.c index e0e3bdb3b4..16ce9f6266 100644 --- a/ompi/mca/pml/dr/pml_dr_sendreq.c +++ b/ompi/mca/pml/dr/pml_dr_sendreq.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Mellanox Technologies. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -102,7 +104,7 @@ static void mca_pml_dr_error_completion( mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata; mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval; - switch(OPAL_SOS_GET_ERROR_CODE(status)) { + switch(status) { case OMPI_ERR_UNREACH: /** * peer is no longer reachable through this btl diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index bbc84ec985..ff2aa14d3e 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -14,6 +14,8 @@ * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
+ * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -558,7 +560,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_ack.hdr_dst_req.pval, pckt->hdr.hdr_ack.hdr_send_offset, pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.pckt_pending, (opal_list_item_t*)pckt); @@ -571,7 +573,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) pckt->hdr.hdr_fin.hdr_des, pckt->order, pckt->hdr.hdr_fin.hdr_fail); - if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) { return; } break; @@ -603,7 +605,7 @@ void mca_pml_ob1_process_pending_rdma(void) } else { rc = mca_pml_ob1_recv_request_get_frag(frag); } - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index e303d9e496..5beb4d1a2a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -12,6 +12,8 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -54,7 +56,7 @@ void mca_pml_ob1_recv_request_process_pending(void) break; recvreq->req_pending = false; rc = mca_pml_ob1_recv_request_schedule_exclusive(recvreq, NULL); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + if(OMPI_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -391,7 +393,7 @@ int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag ) /* queue up get request */ rc = mca_bml_base_get(bml_btl,descriptor); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { mca_bml_base_free(bml_btl, descriptor); OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 182e3d2d28..2646f2fd9b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -347,7 +349,7 @@ static inline int mca_pml_ob1_recv_request_schedule_exclusive( do { rc = mca_pml_ob1_recv_request_schedule_once(req, start_bml_btl); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_recv_request(req)); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 71bfa7c382..4c0b6d1cf1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -58,7 +58,7 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl) switch(pending_type) { case MCA_PML_OB1_SEND_PENDING_SCHEDULE: rc = mca_pml_ob1_send_request_schedule_exclusive(sendreq); - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { return; } break; @@ -71,7 +71,7 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl) MCA_PML_OB1_SEND_PENDING_START, true); } else { rc = mca_pml_ob1_send_request_start_btl(sendreq, send_dst); - if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { /* No more resources on this btl so prepend to the pending * list to minimize reordering and give up for now. */ add_request_to_send_pending(sendreq, @@ -550,7 +550,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, return OMPI_SUCCESS; } - if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_RESOURCE_BUSY == rc) { /* No more resources. 
Allow the upper level to queue the send */ rc = OMPI_ERR_OUT_OF_RESOURCE; } @@ -1192,7 +1192,7 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag ) if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_bml_base_free(bml_btl, des); frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_OUT_OF_RESOURCE == rc) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index cae9570cf4..9ef7e818b8 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -285,7 +287,7 @@ mca_pml_ob1_send_request_schedule_exclusive(mca_pml_ob1_send_request_t* sendreq) int rc; do { rc = mca_pml_ob1_send_request_schedule_once(sendreq); - if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + if(rc == OMPI_ERR_OUT_OF_RESOURCE) break; } while(!unlock_send_request(sendreq)); @@ -444,7 +446,7 @@ mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq ) /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl); - if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) ) return rc; } add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true); diff --git a/ompi/mca/pubsub/base/pubsub_base_select.c b/ompi/mca/pubsub/base/pubsub_base_select.c index a69e1f17b7..301e0110d7 100644 --- a/ompi/mca/pubsub/base/pubsub_base_select.c +++ 
b/ompi/mca/pubsub/base/pubsub_base_select.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +20,6 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_component_repository.h" @@ -41,7 +42,7 @@ int ompi_pubsub_base_select(void) (mca_base_module_t **) &best_module, (mca_base_component_t **) &best_component))) { /* it is okay not to find any executable components */ - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_NOT_FOUND == ret) { ret = OPAL_SUCCESS; } goto cleanup; diff --git a/ompi/mca/pubsub/orte/pubsub_orte.c b/ompi/mca/pubsub/orte/pubsub_orte.c index a87cb4a0e0..4ad80f00f2 100644 --- a/ompi/mca/pubsub/orte/pubsub_orte.c +++ b/ompi/mca/pubsub/orte/pubsub_orte.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +28,6 @@ #include "orte/util/show_help.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" diff --git a/ompi/mca/rcache/rb/rcache_rb.c b/ompi/mca/rcache/rb/rcache_rb.c index 5087378300..535decad3a 100644 --- a/ompi/mca/rcache/rb/rcache_rb.c +++ b/ompi/mca/rcache/rb/rcache_rb.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -121,7 +123,7 @@ int mca_rcache_rb_insert ( if(flags & MCA_MPOOL_FLAGS_CACHE) { rc = mca_rcache_rb_mru_insert( (mca_rcache_rb_module_t*) rcache, reg); if(OMPI_SUCCESS != rc) { - if(OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + if(OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { /* * If the registration is too big for the rcache, * don't cache it and reset the flags so the upper level diff --git a/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c b/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c index 515e0b305e..e6ccaf3678 100644 --- a/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c +++ b/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2004-2011 The Trustees of the University of Tennessee. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -10,7 +12,6 @@ #include "ompi_config.h" #include "vprotocol_pessimist_eventlog.h" -#include "opal/util/opal_sos.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" diff --git a/ompi/mpi/c/unpublish_name.c b/ompi/mpi/c/unpublish_name.c index 24a135545b..e7ecb782ea 100644 --- a/ompi/mpi/c/unpublish_name.c +++ b/ompi/mpi/c/unpublish_name.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,8 +27,6 @@ #include "ompi/info/info.h" #include "ompi/mca/pubsub/pubsub.h" -#include "opal/util/opal_sos.h" - #if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES #pragma weak MPI_Unpublish_name = PMPI_Unpublish_name #endif @@ -68,13 +68,13 @@ int MPI_Unpublish_name(char *service_name, MPI_Info info, */ rc = ompi_pubsub.unpublish(service_name, info); if ( OMPI_SUCCESS != rc ) { - if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_NOT_FOUND == rc) { /* service couldn't be found */ OPAL_CR_EXIT_LIBRARY(); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_SERVICE, FUNC_NAME); } - if (OMPI_ERR_PERM == OPAL_SOS_GET_ERROR_CODE(rc)) { + if (OMPI_ERR_PERM == rc) { /* this process didn't own the specified service */ OPAL_CR_EXIT_LIBRARY(); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ACCESS, diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 98b048a1d9..b4f8204a6e 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -26,7 +28,6 @@ #include "opal/threads/mutex.h" #include "opal/dss/dss.h" #include "opal/util/arch.h" -#include "opal/util/opal_sos.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" @@ -108,7 +109,6 @@ int ompi_proc_init(void) proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; proc->proc_name.vpid = i; - ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN); if (i == ORTE_PROC_MY_NAME->vpid) { ompi_proc_local_proc = proc; @@ -170,7 +170,7 @@ int ompi_proc_complete_init(void) break; #endif } - } else if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { proc->proc_arch = opal_local_arch; } else { errcode = ret; @@ -362,7 +362,6 @@ int ompi_proc_refresh(void) { /* Does not change: proc->proc_name.vpid */ proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid; - ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name)); /* Make sure to clear the local flag before we set it below */ proc->proc_flags = 0; diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index 3ec30856fc..976e69b66a 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 
@@ -87,6 +87,7 @@ #endif #include "ompi/runtime/ompi_cr.h" + int ompi_mpi_finalize(void) { int ret, value; @@ -94,6 +95,7 @@ int ompi_mpi_finalize(void) opal_list_item_t *item; struct timeval ompistart, ompistop; bool timing = false; + orte_grpcomm_collective_t *coll; /* Be a bit social if an erroneous program calls MPI_FINALIZE in two different threads, otherwise we may deadlock in @@ -229,11 +231,19 @@ int ompi_mpi_finalize(void) MPI barrier doesn't ensure that all messages have been transmitted before exiting, so the possibility of a stranded message exists. */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = orte_process_info.peer_fini_barrier; + if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) { ORTE_ERROR_LOG(ret); return ret; } + /* wait for barrier to complete */ + while (coll->active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_RELEASE(coll); + /* check for timing request - get stop time and report elapsed time if so */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 529f36eb47..438acdae8a 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -101,6 +101,7 @@ #include "ompi/runtime/ompi_cr.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/name_fns.h" /* This is required for the boundaries of the hash tables used to store * the F90 types returned by the MPI_Type_create_f90_XXX functions. @@ -290,6 +291,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) struct timeval ompistart, ompistop; char *event_val = NULL; bool orte_setup = false; + orte_grpcomm_collective_t *coll; /* bitflag of the thread level support provided. To be used * for the modex in order to work in heterogeneous environments. 
*/ @@ -547,10 +549,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* exchange connection info - this function also acts as a barrier * as it will not return until the exchange is complete */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = orte_process_info.peer_modex; + if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(coll))) { error = "orte_grpcomm_modex failed"; goto error; } + /* wait for modex to complete - this may be moved anywhere in mpi_init + * so long as it occurs prior to calling a function that needs + * the modex info! + */ + while (coll->active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_RELEASE(coll); if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); @@ -897,7 +909,7 @@ MOVEON: /* If we got "unreachable", then print a specific error message. Otherwise, if we got some other failure, fall through to print a generic message. */ - if (OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(ret)) { + if (OMPI_ERR_UNREACH == ret) { orte_show_help("help-mpi-runtime", "mpi_init:startup:pml-add-procs-fail", true); error = NULL; @@ -934,11 +946,18 @@ MOVEON: } /* wait for everyone to reach this point */ - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = orte_process_info.peer_init_barrier; + if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) { error = "orte_grpcomm_barrier failed"; goto error; } - + /* wait for barrier to complete */ + while (coll->active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_RELEASE(coll); + /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { diff --git a/ompi/tools/ompi-server/ompi-server.c b/ompi/tools/ompi-server/ompi-server.c index 735f71a1f5..0e0f997238 100644 --- a/ompi/tools/ompi-server/ompi-server.c +++ 
b/ompi/tools/ompi-server/ompi-server.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -46,7 +46,6 @@ #include "opal/mca/base/base.h" #include "opal/util/cmd_line.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "opal/util/show_help.h" #include "opal/util/daemon_init.h" #include "opal/runtime/opal.h" @@ -287,7 +286,9 @@ int main(int argc, char *argv[]) } /* wait to hear we are done */ - opal_event_dispatch(opal_event_base); + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } /* should never get here, but if we do... */ diff --git a/ompi/tools/ompi_info/components.c b/ompi/tools/ompi_info/components.c index 49bb325c03..ad3cfe6822 100644 --- a/ompi/tools/ompi_info/components.c +++ b/ompi/tools/ompi_info/components.c @@ -101,6 +101,8 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/grpcomm/base/base.h" #include "orte/mca/ess/ess.h" @@ -396,6 +398,14 @@ void ompi_info_open_components(void) */ orte_process_info.proc_type = ORTE_PROC_HNP; + if (ORTE_SUCCESS != orte_state_base_open()) { + goto error; + } + map = OBJ_NEW(ompi_info_component_map_t); + map->type = strdup("state"); + map->components = &orte_state_base_components_available; + opal_pointer_array_add(&component_map, map); + if (ORTE_SUCCESS != orte_errmgr_base_open()) { goto error; } @@ -789,7 +799,8 @@ void ompi_info_close_components() #endif (void) orte_errmgr_base_close(); - + (void) orte_state_base_close(); + (void) opal_backtrace_base_close(); (void) opal_memory_base_close(); (void) 
opal_memchecker_base_close(); diff --git a/ompi/tools/ompi_info/ompi_info.c b/ompi/tools/ompi_info/ompi_info.c index 9fac292498..2111d61a2a 100644 --- a/ompi/tools/ompi_info/ompi_info.c +++ b/ompi/tools/ompi_info/ompi_info.c @@ -268,6 +268,7 @@ int main(int argc, char *argv[]) opal_pointer_array_add(&mca_types, "filem"); #endif /* these are always included */ + opal_pointer_array_add(&mca_types, "state"); opal_pointer_array_add(&mca_types, "errmgr"); opal_pointer_array_add(&mca_types, "ess"); opal_pointer_array_add(&mca_types, "grpcomm"); diff --git a/opal/mca/base/mca_base_components_open.c b/opal/mca/base/mca_base_components_open.c index e5e5a1f7f8..b6743a7e61 100644 --- a/opal/mca/base/mca_base_components_open.c +++ b/opal/mca/base/mca_base_components_open.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -27,7 +29,6 @@ #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_component_repository.h" @@ -392,7 +393,7 @@ static int open_components(const char *type_name, int output_id, "mca: base: components_open: " "component %s register function successful", component->mca_component_name); - } else if (OPAL_ERR_NOT_AVAILABLE != OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OPAL_ERR_NOT_AVAILABLE != ret) { /* If the component returns OPAL_ERR_NOT_AVAILABLE, it's a cue to "silently ignore me" -- it's not a failure, it's just a way for the component to say @@ -432,7 +433,7 @@ static int open_components(const char *type_name, int output_id, "mca: base: components_open: " "component %s open function successful", component->mca_component_name); - } else if (OPAL_ERR_NOT_AVAILABLE != OPAL_SOS_GET_ERROR_CODE(ret)) { + } else if (OPAL_ERR_NOT_AVAILABLE != ret) { /* If the component returns OPAL_ERR_NOT_AVAILABLE, it's a cue to "silently ignore me" -- it's not a failure, it's just a way for the component to say diff --git a/opal/mca/compress/base/compress_base_open.c b/opal/mca/compress/base/compress_base_open.c index 6324ab5887..53f0f69f6d 100644 --- a/opal/mca/compress/base/compress_base_open.c +++ b/opal/mca/compress/base/compress_base_open.c @@ -2,6 +2,8 @@ * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. * + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -18,7 +20,6 @@ #include "opal/mca/compress/compress.h" #include "opal/mca/compress/base/base.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "opal/mca/compress/base/static-components.h" @@ -84,7 +85,7 @@ int opal_compress_base_open(void) mca_compress_base_static_components, &opal_compress_base_components_available, true)) ) { - if( OPAL_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret) && + if( OPAL_ERR_NOT_FOUND == ret && NULL != str_value && 0 == strncmp(str_value, "none", strlen("none")) ) { exit_status = OPAL_SUCCESS; diff --git a/opal/mca/crs/base/crs_base_open.c b/opal/mca/crs/base/crs_base_open.c index b172fea33a..40bca363f3 100644 --- a/opal/mca/crs/base/crs_base_open.c +++ b/opal/mca/crs/base/crs_base_open.c @@ -8,6 +8,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Evergrid, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * * $COPYRIGHT$ * @@ -28,7 +30,6 @@ #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" #include "opal/util/output.h" -#include "opal/util/opal_sos.h" #include "opal/mca/crs/base/static-components.h" @@ -95,7 +96,7 @@ int opal_crs_base_open(void) mca_crs_base_static_components, &opal_crs_base_components_available, true)) ) { - if( OPAL_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret) && + if( OPAL_ERR_NOT_FOUND == ret && NULL != str_value && 0 == strncmp(str_value, "none", strlen("none")) ) { exit_status = OPAL_SUCCESS; diff --git a/opal/mca/event/base/base.h b/opal/mca/event/base/base.h index 567f2e7a1c..1b87c95149 100644 --- a/opal/mca/event/base/base.h +++ b/opal/mca/event/base/base.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -12,6 +14,8 @@ #include "opal_config.h" +#include "opal/class/opal_pointer_array.h" + #include "opal/mca/event/event.h" /* @@ -95,7 +99,6 @@ OPAL_DECLSPEC int opal_event_base_close(void); OPAL_DECLSPEC extern int opal_event_base_output; OPAL_DECLSPEC extern opal_list_t opal_event_components; - END_C_DECLS #endif /* OPAL_BASE_EVENT_H */ diff --git a/opal/mca/event/base/event_base_close.c b/opal/mca/event/base/event_base_close.c index 1b8ae2f131..6ecb89f426 100644 --- a/opal/mca/event/base/event_base_close.c +++ b/opal/mca/event/base/event_base_close.c @@ -21,9 +21,6 @@ int opal_event_base_close(void) opal_event_base_inited--; - /* release the event base */ - opal_event_base_finalize(opal_event_base); - /* no need to close the component as it was statically opened */ /* for support of tools such as ompi_info */ diff --git a/opal/mca/event/base/event_base_open.c b/opal/mca/event/base/event_base_open.c index 2b3ea8683c..22d1c5ba03 100644 --- a/opal/mca/event/base/event_base_open.c +++ b/opal/mca/event/base/event_base_open.c @@ -79,7 +79,12 @@ int opal_event_base_open(void) /* get our event base */ if (NULL == (opal_event_base = opal_event_base_create())) { - rc = OPAL_ERROR; + return OPAL_ERROR; + } + + /* set the number of priorities */ + if (0 < OPAL_EVENT_NUM_PRI) { + opal_event_base_priority_init(opal_event_base, OPAL_EVENT_NUM_PRI); } return rc; diff --git a/opal/mca/event/event.h b/opal/mca/event/event.h index c251601940..bc12b482fd 100644 --- a/opal/mca/event/event.h +++ b/opal/mca/event/event.h @@ -25,6 +25,8 @@ #include #endif +#include "opal/class/opal_pointer_array.h" + #include "opal/mca/mca.h" #include "opal/mca/base/base.h" @@ -38,6 +40,17 @@ typedef unsigned char u_char; typedef unsigned short u_short; #endif +/* set the number of event priority levels */ +#define OPAL_EVENT_NUM_PRI 8 + +#define OPAL_EV_ERROR_PRI 0 +#define OPAL_EV_MSG_HI_PRI 1 +#define OPAL_EV_SYS_HI_PRI 2 +#define OPAL_EV_MSG_LO_PRI 
3 +#define OPAL_EV_SYS_LO_PRI 4 +#define OPAL_EV_INFO_HI_PRI 5 +#define OPAL_EV_INFO_LO_PRI 6 +#define OPAL_EV_LOWEST_PRI 7 #define OPAL_EVENT_SIGNAL(ev) opal_event_get_signal(ev) diff --git a/opal/mca/event/libevent2013/configure.m4 b/opal/mca/event/libevent2013/configure.m4 index d955805f6a..063cf943d4 100644 --- a/opal/mca/event/libevent2013/configure.m4 +++ b/opal/mca/event/libevent2013/configure.m4 @@ -87,8 +87,8 @@ AC_DEFUN([MCA_opal_event_libevent2013_CONFIG],[ AC_ARG_ENABLE(event-debug, AC_HELP_STRING([--enable-event-debug], [enable event library debug output])) - if test "$enable_event_debug" = "no"; then - event_args="$event_args --disable-debug-mode" + if test "$enable_event_debug" = "yes"; then + event_args="$event_args --enable-debug-mode" fi AC_ARG_ENABLE(event-thread-support, diff --git a/opal/mca/event/libevent2013/libevent/event.c b/opal/mca/event/libevent2013/libevent/event.c index c0d36a9c5b..4ae06531d7 100644 --- a/opal/mca/event/libevent2013/libevent/event.c +++ b/opal/mca/event/libevent2013/libevent/event.c @@ -1519,9 +1519,6 @@ event_base_loop(struct event_base *base, int flags) * as we invoke user callbacks. */ EVBASE_ACQUIRE_LOCK(base, th_base_lock); - /**** OMPI CHANGE ****/ - /* Disable reentrant check */ -#if 0 if (base->running_loop) { event_warnx("%s: reentrant invocation. 
Only one event_base_loop" " can run on each event_base at once.", __func__); @@ -1530,8 +1527,6 @@ event_base_loop(struct event_base *base, int flags) } base->running_loop = 1; -#endif - /**** END OMPI CHANGE ****/ clear_time_cache(base); @@ -2148,14 +2143,8 @@ event_del(struct event *ev) int res; if (EVUTIL_FAILURE_CHECK(!ev->ev_base)) { - /**** OMPI CHANGE ****/ - /* Disable warning and return 0 */ - return 0; -#if 0 event_warnx("%s: event has no event_base set.", __func__); return -1; -#endif - /**** END OMPI CHANGE ****/ } EVBASE_ACQUIRE_LOCK(ev->ev_base, th_base_lock); diff --git a/opal/mca/event/libevent2013/libevent2013.h b/opal/mca/event/libevent2013/libevent2013.h index 43a0f5ab89..9f5a190ef4 100644 --- a/opal/mca/event/libevent2013/libevent2013.h +++ b/opal/mca/event/libevent2013/libevent2013.h @@ -1,6 +1,8 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. * * $COPYRIGHT$ * @@ -57,49 +59,14 @@ #include "opal/mca/event/event.h" -typedef struct event opal_event_t; -/*** Overload the event_base_t struct ***/ -/* This may (hopefully) be a temporary change - * to deal with cross-base sync. Specifically, - * when an event in one base needs to release - * a condition_wait in another base, we need - * to "wakeup" the event base in the second base - * so the condition_wait can be checked - * - * On a more permanent level, use this to update - * the event base when it is being progressed in - * a separate thread. 
- */ -typedef struct { - struct event_base *base; - opal_event_t update_event; - int update_pipe[2]; -} opal_event_base_t; +typedef event_callback_fn opal_event_cbfunc_t; -typedef struct { - opal_event_t *ev; - uint8_t op; -} opal_event_update_t; - -#define OPAL_EVENT_NOOP 0x00 -#define OPAL_EVENT_ADD 0x01 -#define OPAL_EVENT_DEL 0x02 - -#if OPAL_EVENT_HAVE_THREAD_SUPPORT -#define OPAL_UPDATE_EVBASE(b, evt, ad) -#else -#define OPAL_UPDATE_EVBASE(b, evt, ad) \ - do { \ - opal_event_update_t up; \ - up.ev = (evt); \ - up.op = (ad); \ - opal_fd_write((b)->update_pipe[1], sizeof(opal_event_update_t), &up); \ - } while(0); -#endif BEGIN_C_DECLS -/* Temporary global - will be replaced by layer-specific event bases */ +typedef struct event_base opal_event_base_t; +typedef struct event opal_event_t; + OPAL_DECLSPEC extern opal_event_base_t *opal_event_base; #define OPAL_EV_TIMEOUT EV_TIMEOUT @@ -114,14 +81,19 @@ OPAL_DECLSPEC extern opal_event_base_t *opal_event_base; /* Global function to create and release an event base */ OPAL_DECLSPEC opal_event_base_t* opal_event_base_create(void); -OPAL_DECLSPEC void opal_event_base_finalize(opal_event_base_t *base); + +#define opal_event_base_free(x) event_base_free(x) OPAL_DECLSPEC int opal_event_init(void); -OPAL_DECLSPEC int opal_event_reinit(opal_event_base_t *base); +#define opal_event_reinit(b) event_reinit((b)) -OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_base_t *evbase, - struct timeval *tv_in); +#define opal_event_base_init_common_timeout (b, t) event_base_init_common_timeout((b), (t)) + +/* Event priority APIs */ +#define opal_event_base_priority_init(b, n) event_base_priority_init((b), (n)) + +#define opal_event_set_priority(x, n) event_priority_set((x), (n)) /* thread support APIs */ #if OPAL_EVENT_HAVE_THREAD_SUPPORT @@ -135,9 +107,11 @@ OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_ba #endif /* Basic event APIs */ +#define opal_event_enable_debug_mode() 
event_enable_debug_mode() + #define opal_event_set_debug_output(x) event_set_debug_output((x)) -#define opal_event_set(b, ev, fd, fg, cb, arg) event_assign((ev), (b)->base, (fd), (fg), (event_callback_fn) (cb), (arg)) +#define opal_event_set(b, x, fd, fg, cb, arg) event_assign((x), (b), (fd), (fg), (event_callback_fn) (cb), (arg)) #define opal_event_add(ev, tv) event_add((ev), (tv)) @@ -145,39 +119,39 @@ OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_ba #define opal_event_active(x, y, z) event_active((x), (y), (z)) -#define opal_event_new(b, fd, fg, cb, arg) event_new((b)->base, (fd), (fg), (event_callback_fn) (cb), (arg)) +#define opal_event_new(b, fd, fg, cb, arg) event_new((b), (fd), (fg), (event_callback_fn) (cb), (arg)) + +OPAL_DECLSPEC opal_event_t* opal_event_alloc(void); #define opal_event_free(x) event_free((x)) /* Timer APIs */ -#define opal_event_evtimer_new(b, cb, arg) event_new((b)->base, -1, 0, (event_callback_fn) (cb), (arg)) +#define opal_event_evtimer_new(b, cb, arg) opal_event_new((b), -1, 0, (cb), (arg)) -#define opal_event_evtimer_add(ev, tv) event_add((ev), (tv)) +#define opal_event_evtimer_add(x, tv) opal_event_add((x), (tv)) -#define opal_event_evtimer_set(b, ev, cb, arg) event_assign((ev), (b)->base, -1, 0, (event_callback_fn) (cb), (arg)) +#define opal_event_evtimer_set(b, x, cb, arg) event_assign((x), (b), -1, 0, (event_callback_fn) (cb), (arg)) -#define opal_event_evtimer_del(ev) event_del((ev)) +#define opal_event_evtimer_del(x) opal_event_del((x)) -#define opal_event_evtimer_pending(ev, tv) event_pending((ev), EV_TIMEOUT, (tv)) +#define opal_event_evtimer_pending(x, tv) event_pending((x), EV_TIMEOUT, (tv)) -#define opal_event_evtimer_initialized(ev) event_initialized((ev)) +#define opal_event_evtimer_initialized(x) event_initialized((x)) /* Signal APIs */ -#define opal_event_signal_add(ev, tv) event_add((ev), (tv)) +#define opal_event_signal_add(x, tv) event_add((x), (tv)) -#define opal_event_signal_set(b, 
ev, fd, cb, arg) event_assign((ev), (b)->base, (fd), EV_SIGNAL|EV_PERSIST, (event_callback_fn) (cb), (arg)) +#define opal_event_signal_set(b, x, fd, cb, arg) event_assign((x), (b), (fd), EV_SIGNAL|EV_PERSIST, (event_callback_fn) (cb), (arg)) -#define opal_event_signal_del(ev) event_del((ev)) +#define opal_event_signal_del(x) event_del((x)) -#define opal_event_signal_pending(ev, tv) event_pending((ev), EV_SIGNAL, (tv)) +#define opal_event_signal_pending(x, tv) event_pending((x), EV_SIGNAL, (tv)) -#define opal_event_signal_initalized(ev) event_initialized((ev)) +#define opal_event_signal_initalized(x) event_initialized((x)) -#define opal_event_get_signal(ev) event_get_signal((ev)) +#define opal_event_get_signal(x) event_get_signal((x)) -#define opal_event_loop(b, fg) event_base_loop((b->base), (fg)) - -#define opal_event_dispatch(b) event_base_loop((b)->base, 0) +#define opal_event_loop(b, fg) event_base_loop((b), (fg)) END_C_DECLS diff --git a/opal/mca/event/libevent2013/libevent2013_module.c b/opal/mca/event/libevent2013/libevent2013_module.c index fbd8b707eb..305ff40f55 100644 --- a/opal/mca/event/libevent2013/libevent2013_module.c +++ b/opal/mca/event/libevent2013/libevent2013_module.c @@ -109,83 +109,16 @@ static const struct eventop *eventops[] = { static struct event_config *config=NULL; -static void update_event(int fd, short flags, void* arg) -{ - opal_event_update_t up; - - /* read the event */ - opal_fd_read(fd, sizeof(opal_event_update_t), &up); - if (NULL == up.ev) { - return; - } - if (OPAL_EVENT_ADD == up.op) { - event_add(up.ev, 0); - } else if (OPAL_EVENT_DEL == up.op) { - event_del(up.ev); - } - return; -} - -/* Public function -- not part of the module */ -/* This includes (hopefully) a temporary change - * to deal with cross-base sync. 
Specifically, - * when an event in one base needs to release - * a condition_wait in another base, we need - * to "wakeup" the event base in the second base - * so the condition_wait can be checked - */ opal_event_base_t* opal_event_base_create(void) { - struct event_base *base; - opal_event_base_t *evbase; + opal_event_base_t *base; base = event_base_new_with_config(config); if (NULL == base) { /* there is no backend method that does what we want */ opal_output(0, "No event method available"); - return NULL; } - evbase = (opal_event_base_t*)malloc(sizeof(opal_event_base_t)); - evbase->base = base; -#ifndef __WINDOWS__ - if (pipe(evbase->update_pipe) < 0) { - opal_output(0, "Unable to open update pipe"); - free(evbase); - event_base_free(base); - return NULL; - } -#else - if (create_socketpair(AF_UNIX, SOCK_STREAM, 0, evbase->update_pipe) == -1) { - opal_output(0, "Unable to open update socket"); - free(evbase); - event_base_free(base); - return NULL; - } -#endif - event_assign(&evbase->update_event, base, - evbase->update_pipe[0], EV_READ | EV_PERSIST, - update_event, NULL); - event_add(&evbase->update_event, 0); - return evbase; -} - -void opal_event_base_finalize(opal_event_base_t *evbase) -{ - /* delete the wakeup event */ - event_del(&evbase->update_event); -#ifndef __WINDOWS__ - /* close the pipe */ - close(evbase->update_pipe[0]); - close(evbase->update_pipe[1]); -#else - /* close the socket */ - closesocket(evbase->update_pipe[0]); - closesocket(evbase->update_pipe[1]); -#endif - /* release the base */ - event_base_free(evbase->base); - /* free the storage */ - free(evbase); + return base; } int opal_event_init(void) @@ -304,14 +237,10 @@ int opal_event_init(void) return OPAL_SUCCESS; } -int opal_event_reinit(opal_event_base_t *evbase) +opal_event_t* opal_event_alloc(void) { - return event_reinit(evbase->base); -} + opal_event_t *ev; -struct timeval *opal_event_base_init_common_timeout (opal_event_base_t *evbase, - struct timeval *tv_in) -{ - return (struct 
timeval*)event_base_init_common_timeout (evbase->base, tv_in); + ev = (opal_event_t*)malloc(sizeof(opal_event_t)); + return ev; } - diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index f828146b6d..bef3d48f3f 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -1456,5 +1456,6 @@ char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality) ptr->buffers[ptr->cntr][idx++] = 'K'; ptr->buffers[ptr->cntr][idx++] = '\0'; } + return ptr->buffers[ptr->cntr]; } diff --git a/opal/runtime/opal_finalize.c b/opal/runtime/opal_finalize.c index e3dad9ff71..5004c41202 100644 --- a/opal/runtime/opal_finalize.c +++ b/opal/runtime/opal_finalize.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -31,7 +31,6 @@ #include "opal/util/net.h" #include "opal/util/keyval_parse.h" #include "opal/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/memoryhooks/memory.h" #include "opal/mca/base/base.h" #include "opal/runtime/opal.h" @@ -87,9 +86,6 @@ opal_finalize_util(void) /* finalize the trace system */ opal_trace_finalize(); - /* finalize the OPAL SOS system */ - opal_sos_finalize(); - /* finalize the show_help system */ opal_show_help_finalize(); diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index af9e3e67a7..427b794ba9 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -12,7 +12,7 @@ * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. 
+ * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -60,7 +60,6 @@ #include "opal/util/stacktrace.h" #include "opal/util/keyval_parse.h" #include "opal/util/sys_limits.h" -#include "opal/util/opal_sos.h" #if OPAL_CC_USE_PRAGMA_IDENT #pragma ident OPAL_IDENT_STRING @@ -78,7 +77,7 @@ opal_err2str(int errnum, const char **errmsg) { const char *retval; - switch (OPAL_SOS_GET_ERROR_CODE(errnum)) { + switch (errnum) { case OPAL_SUCCESS: retval = "Success"; break; @@ -255,9 +254,6 @@ opal_init_util(int* pargc, char*** pargv) /* initialize the help system */ opal_show_help_init(); - /* initialize the OPAL SOS system */ - opal_sos_init(); - /* register handler for errnum -> string converstion */ if (OPAL_SUCCESS != (ret = opal_error_register("OPAL", diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am index 1ab987e853..3323b5813a 100644 --- a/opal/util/Makefile.am +++ b/opal/util/Makefile.am @@ -19,7 +19,7 @@ SUBDIRS = keyval -dist_pkgdata_DATA = help-opal-util.txt opal_sos_reporter.txt +dist_pkgdata_DATA = help-opal-util.txt AM_LFLAGS = -Popal_show_help_yy LEX_OUTPUT_ROOT = lex.opal_show_help_yy @@ -49,7 +49,6 @@ headers = \ opal_environ.h \ opal_getcwd.h \ opal_pty.h \ - opal_sos.h \ os_dirpath.h \ os_path.h \ output.h \ @@ -82,7 +81,6 @@ libopalutil_la_SOURCES = \ opal_environ.c \ opal_getcwd.c \ opal_pty.c \ - opal_sos.c \ os_dirpath.c \ os_path.c \ output.c \ diff --git a/opal/util/error.c b/opal/util/error.c index 50f76fa07b..fb948bfd74 100644 --- a/opal/util/error.c +++ b/opal/util/error.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. * All rights reserved. 
* $COPYRIGHT$ * @@ -30,7 +30,6 @@ #endif #include "opal/util/error.h" -#include "opal/util/opal_sos.h" #include "opal/constants.h" #define MAX_CONVERTERS 5 @@ -99,12 +98,12 @@ opal_perror(int errnum, const char *msg) const char* errmsg; ret = opal_strerror_int(errnum, &errmsg); - if (NULL != msg && OPAL_SOS_GET_ERROR_CODE(errnum) != OPAL_ERR_IN_ERRNO) { + if (NULL != msg && errnum != OPAL_ERR_IN_ERRNO) { fprintf(stderr, "%s: ", msg); } if (OPAL_SUCCESS != ret) { - if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { + if (errnum == OPAL_ERR_IN_ERRNO) { perror(msg); } else { char *ue_msg; @@ -129,7 +128,7 @@ opal_strerror(int errnum) int ret; const char* errmsg; - if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { + if (errnum == OPAL_ERR_IN_ERRNO) { return strerror(errno); } @@ -156,7 +155,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen) ret = opal_strerror_int(errnum, &errmsg); if (OPAL_SUCCESS != ret) { - if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { + if (errnum == OPAL_ERR_IN_ERRNO) { char *tmp = strerror(errno); strncpy(strerrbuf, tmp, buflen); return OPAL_SUCCESS; diff --git a/opal/util/opal_sos.c b/opal/util/opal_sos.c deleted file mode 100644 index deadcf968d..0000000000 --- a/opal/util/opal_sos.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#ifdef HAVE_STRING_H -#include -#endif -#include -#include -#ifdef HAVE_STDARG_H -#include -#endif -#ifdef HAVE_STDLIB_H -#include -#endif - -#include "opal/util/opal_sos.h" -#include "opal/constants.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/class/opal_hash_table.h" -#include "opal/util/stacktrace.h" -#include "opal/util/show_help.h" - -/** Global variables */ -opal_hash_table_t opal_sos_table; -opal_mutex_t opal_sos_table_lock; -bool opal_sos_print_low; - -/* Local variables */ -static bool opal_sos_initialized = false; -static const char *dash_line = "--------------------------------------------------------------------------"; -static const char *stackhdr = "[STACK TRACE]:\n"; - -/* Local functions */ -static void opal_sos_error_construct(opal_sos_error_t *obj); -static void opal_sos_error_destruct(opal_sos_error_t *obj); - -/** OPAL SOS callback function pointers */ -static opal_sos_print_callback_fn_t cur_print_callback; -static opal_sos_reporter_callback_fn_t cur_reporter_callback; -/* static opal_sos_print_callback_fn_t prev_print_callback; */ -static opal_sos_reporter_callback_fn_t prev_reporter_callback; - -OBJ_CLASS_INSTANCE(opal_sos_error_t, - opal_object_t, - opal_sos_error_construct, - opal_sos_error_destruct); - -/** - * Constructor - */ -static void opal_sos_error_construct(opal_sos_error_t *obj) -{ - obj->errnum = 0; - obj->file = NULL; - obj->line = 0; - obj->func = NULL; - obj->msg = NULL; - obj->prev = obj->next = OPAL_SOS_ERR_BASE; -} - -/** - * Destructor - */ -static void opal_sos_error_destruct(opal_sos_error_t *obj) -{ - if (NULL != obj->file) { - free(obj->file); - } - - if (NULL != obj->func) { - free(obj->func); - } - - if (NULL != obj->msg) { - free(obj->msg); - } -} - -/** - * Initialize the OPAL SOS interface - * - */ -void opal_sos_init(void) -{ - int value; - - if (opal_sos_initialized) { - return; - } - - 
mca_base_param_reg_int_name("opal", "sos_print_low", - "Set to non-zero to enable the print-at-bottom" - " preference for OPAL SOS. Enabling this option prints" - " out the errors, warnings or info messages as" - " soon as they are encountered.", - false, false, (int)false, &value); - - opal_sos_print_low = OPAL_INT_TO_BOOL(value); - - OBJ_CONSTRUCT(&opal_sos_table, opal_hash_table_t); - opal_hash_table_init(&opal_sos_table, OPAL_SOS_ERR_TABLE_SIZE); - OBJ_CONSTRUCT(&opal_sos_table_lock, opal_mutex_t); - - opal_sos_reg_reporter_callback(opal_sos_print_error, &prev_reporter_callback); - opal_sos_initialized = true; - return; -} - -/** - * Finalize the OPAL SOS interface - * - */ -void opal_sos_finalize(void) -{ - OBJ_DESTRUCT(&opal_sos_table); - OBJ_DESTRUCT(&opal_sos_table_lock); - opal_sos_initialized = false; - return; -} - -/** - * Free all the SOS errors represented by the error code pointed to by \c errnum - * - */ -void opal_sos_free(int *errnum) -{ - opal_sos_error_t *opal_error, *attached_error; - int err, attached_errnum; - - if (NULL == errnum) { - return; - } else if (true == OPAL_SOS_IS_NATIVE(*errnum)) { - return; - } else { - err = *errnum; - } - - *errnum = OPAL_SOS_GET_ERROR_CODE(err); - - do { - /* Look for attached errors */ - if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(err))) { - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, - attached_errnum, - (void **)&attached_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - /* If there's an attached error trace, free it! 
*/ - if (NULL != attached_error) { - attached_errnum = attached_error->errnum; - opal_sos_free(&attached_errnum); - } - } - - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(err), - (void **)&opal_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - if (NULL == opal_error) { - goto cleanup; - } - - opal_sos_error_destruct(opal_error); - /* Remove the entry from the SOS table */ - OPAL_THREAD_LOCK(&opal_sos_table_lock); - opal_hash_table_remove_value_uint32(&opal_sos_table, OPAL_SOS_GET_INDEX(err)); - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - err = opal_error->prev; - } while (OPAL_SOS_ERR_BASE != err); - -cleanup: - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); -} - -opal_sos_error_t * -opal_sos_build_error(int errnum, bool show_stack, const char *errmsg, ...) -{ - opal_sos_error_t *opal_error; - char *stackframe, msg[OPAL_SOS_MAX_ERR_LEN]; - va_list arglist; - int ret_errno = 0, len; - - if (!opal_sos_initialized) { - opal_sos_init(); - } - - opal_error = OBJ_NEW(opal_sos_error_t); - if (NULL == opal_error) { - return NULL; /* OPAL_ERR_OUT_OF_RESOURCE */ - } - - va_start(arglist, errmsg); - len = vsnprintf(msg, OPAL_SOS_MAX_ERR_LEN, errmsg, arglist); - va_end(arglist); -#if OPAL_WANT_PRETTY_PRINT_STACKTRACE - if ((true == show_stack) && - (NULL != (stackframe = opal_stackframe_output_string()))) { - len += strlen(stackhdr) + strlen(stackframe) + 2; - if (len > OPAL_SOS_MAX_ERR_LEN) - len = OPAL_SOS_MAX_ERR_LEN; - - opal_error->msg = (char *) malloc(len); - if (NULL == opal_error->msg) { - return NULL; - } - snprintf(opal_error->msg, len, "%s\n%s%s", msg, stackhdr, stackframe); - } else { - opal_error->msg = strdup(msg); - } -#else - opal_error->msg = strdup ("OPAL_WANT_PRETTY_PRINT_STACKTRACE disabled"); -#endif - - /* Check if errnum is a native error code and encode it into - the encoded error code if it is native */ - if (OPAL_SOS_IS_NATIVE(errnum)) { - 
OPAL_SOS_SET_ERROR_CODE(ret_errno, errnum); - } else { - /* Extract the native error code from the encoded error and - encode it back again into the newly encoded error code */ - OPAL_SOS_SET_ERROR_CODE(ret_errno, OPAL_SOS_GET_ERROR_CODE(errnum)); - opal_error->prev = errnum; - } - - opal_error->errnum = ret_errno; - return opal_error; -} - -int opal_sos_reporter(const char *file, int line, const char *func, - opal_sos_severity_t severity, opal_sos_error_t *opal_error) -{ - opal_sos_error_t *prev_error; - int ret_errno = 0, hash; - - if (NULL == opal_error) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Doing more strict validation here since if either of the file, - * func or msg are not known we replace it by to avoid any issues - * during dss pack/unpack - */ - opal_error->file = (NULL != file)?strdup(file):strdup(""); - opal_error->func = (NULL != func)?strdup(func):strdup(""); - opal_error->line = line; - - ret_errno = opal_error->errnum; - /* Encode the severity level into the return error code */ - OPAL_SOS_SET_SEVERITY(ret_errno, severity); - hash = opal_sos_hash_error(opal_error); - OPAL_SOS_SET_INDEX(ret_errno, hash); - opal_error->errnum = ret_errno; - - if (opal_sos_print_low) { - opal_sos_report_error(opal_error); - } - - /* Add the error object to the error table */ - OPAL_THREAD_LOCK(&opal_sos_table_lock); - - if (OPAL_SUCCESS != - opal_hash_table_set_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(ret_errno), - (void *)opal_error)) { - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - OBJ_DESTRUCT(opal_error); - return OPAL_ERROR; - } - - /* Get the previous error in the error call stack and update - its next error pointer */ - prev_error = NULL; - opal_hash_table_get_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(opal_error->prev), - (void **)&prev_error); - if (NULL != prev_error) { - prev_error->next = opal_error->errnum; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - return ret_errno; -} - -void -opal_sos_report_error(opal_sos_error_t 
*error) -{ - opal_sos_severity_t severity; - char *pretty_error; - int errnum, ret; - - if (NULL == error) - return; - - severity = (opal_sos_severity_t)OPAL_SOS_GET_SEVERITY(error->errnum); - - /* An OPAL SOS encoded error number holds no meaning outside - * the context of Open MPI. We convert it back to the native - * error code before reporting it. */ - if (true == OPAL_SOS_IS_NATIVE(error->errnum)) { - errnum = error->errnum; - } else { - errnum = OPAL_SOS_GET_ERROR_CODE(error->errnum); - } - - /* Prettify the error for printing it locally */ - ret = opal_sos_prettify_error(error->msg, &pretty_error); - - (*cur_reporter_callback)(severity, errnum, "<%s> at %s:%d:%s():\n%s", - opal_sos_severity2str(severity), error->file, - error->line, error->func, - ((0 > ret) ? error->msg : pretty_error)); - - if (ret > 0) { - free(pretty_error); - } - - /* Call the previous reporter callback which should be the selected - * ORTE notifier components */ - if (NULL != prev_reporter_callback) { - prev_reporter_callback(severity, errnum, "<%s> at %s:%d:%s():\n%s", - opal_sos_severity2str(severity), error->file, - error->line, error->func, error->msg); - } -} - -void opal_sos_print(int errnum, bool show_history) -{ - opal_sos_error_t *opal_error, *prev_opal_error, *attached_error; - int tmp, attached_errnum, prev_severity, severity; - - opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line); - tmp = errnum; - prev_opal_error = NULL; - do { - /* If there is an error attached to this error, print it out. 
*/ - if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(errnum))) { - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, - attached_errnum, - (void **)&attached_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - if (NULL != attached_error) { - opal_sos_print(attached_error->errnum, show_history); - } - } - - OPAL_THREAD_LOCK(&opal_sos_table_lock); - if (OPAL_SUCCESS != - opal_hash_table_get_value_uint32(&opal_sos_table, - OPAL_SOS_GET_INDEX(errnum), - (void **)&opal_error)) { - goto cleanup; - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - if (NULL == opal_error) { - return; - } - - if (NULL != prev_opal_error) { - prev_severity = OPAL_SOS_GET_SEVERITY(prev_opal_error->errnum); - severity = OPAL_SOS_GET_SEVERITY(errnum); - - /* If show_history is enabled, or if the preceeding error - was of higher severity, then report the error */ - if (show_history || (prev_severity <= severity)) - /* Print the error denoted by errnum. */ - opal_sos_report_error(prev_opal_error); - } - - prev_opal_error = opal_error; - /* Get the previous error */ - errnum = opal_error->prev; - /* Terminating condition */ - if (OPAL_SOS_ERR_BASE == errnum) { - opal_sos_report_error(opal_error); - } - } while (errnum != OPAL_SOS_ERR_BASE); - opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line); - errnum = tmp; - return; - -cleanup: - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); -} - -void opal_sos_print_error(opal_sos_severity_t severity, int errnum, const char *errmsg, ...) 
-{ - va_list arglist; - va_start(arglist, errmsg); - opal_show_vhelp("opal_sos_reporter.txt", "general message", false, arglist); - va_end(arglist); -} - -void opal_sos_log(int errnum) -{ - opal_sos_print(errnum, false); - opal_sos_free(&errnum); -} - -int opal_sos_prettify_error(const char *error, char **pretty_error) -{ - char *str, *token, *saveptr, *errdup; - const char *prefix = "\n| | "; - int len = 0, plen, left; - - if (NULL == error) { - return OPAL_ERROR; - } - - *pretty_error = (char *) malloc(OPAL_SOS_MAX_ERR_LEN); - if (NULL == *pretty_error) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - *(*pretty_error) = '\0'; - - plen = strlen(prefix); - - if (NULL != (errdup = strdup(error))) { - for (str = errdup, len = 0; len < OPAL_SOS_MAX_ERR_LEN; str = NULL) { - if (NULL == (token = strtok_r(str, "\n", &saveptr))) { - break; - } - - left = strlen(token); - if ((len + left) > OPAL_SOS_MAX_ERR_LEN) { - left = OPAL_SOS_MAX_ERR_LEN - len; - } - strncat(*pretty_error, token, left); - len += left; - - left = plen; - if ((len + left) > OPAL_SOS_MAX_ERR_LEN) { - left = OPAL_SOS_MAX_ERR_LEN - len; - } - strncat(*pretty_error, prefix, left); - len += left; - } - free(errdup); - errdup = NULL; - } - - return len; -} - -const char *opal_sos_severity2str(opal_sos_severity_t severity) -{ - switch(severity) { - case OPAL_SOS_SEVERITY_EMERG: return "EMERGENCY"; - case OPAL_SOS_SEVERITY_ALERT: return "ALERT MESSAGE"; - case OPAL_SOS_SEVERITY_CRIT: return "CRITICAL MESSAGE"; - case OPAL_SOS_SEVERITY_ERROR: return "ERROR"; - case OPAL_SOS_SEVERITY_WARN: return "WARNING"; - case OPAL_SOS_SEVERITY_NOTICE: return "NOTICE"; - case OPAL_SOS_SEVERITY_INFO: return "INFO MESSAGE"; - case OPAL_SOS_SEVERITY_DEBUG: return "DEBUG MESSAGE"; - default: return "UNKNOWN ERROR"; - } -} - -int opal_sos_hash_error(opal_sos_error_t *error) -{ - int hash, c; - char *msg; - - /* Naive string hash function to create a key based on the error - details, namely length of the file name, length of the 
function - name and the sum of the characters in the error message */ - - hash = error->errnum; - if (NULL != error->file) { - hash += strlen(error->file); - } - if (NULL != error->func) { - hash += strlen(error->func); - } - if (NULL != error->msg) { - msg = error->msg; - while ('\0' != (c = *msg++)) { - hash += c; - } - } - - return (hash & (OPAL_SOS_ERR_TABLE_SIZE - 1)); -} - -int opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func, - opal_sos_print_callback_fn_t *prev_func) -{ - /* Preserve the previous print callback */ - *prev_func = cur_print_callback; - - /* Update the current print callback */ - cur_print_callback = new_func; - return OPAL_SUCCESS; -} - -int opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func, - opal_sos_reporter_callback_fn_t *prev_func) -{ - /* Preserve the previous reporter callback */ - *prev_func = cur_reporter_callback; - - /* Update the current reporter callback */ - cur_reporter_callback = new_func; - return OPAL_SUCCESS; -} diff --git a/opal/util/opal_sos.h b/opal/util/opal_sos.h deleted file mode 100644 index 22df2395c4..0000000000 --- a/opal/util/opal_sos.h +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_SOS_H -#define OPAL_SOS_H - -#ifdef HAVE_LIMITS_H -#include -#endif -#ifdef HAVE_SYSLOG_H -#include -#endif - -#include "opal/class/opal_object.h" -#include "opal/class/opal_hash_table.h" -#include "opal/threads/mutex.h" -#include "opal/util/output.h" - -#ifdef __STDC_VERSION__ -# if __STDC_VERSION__ < 199901L -# if defined(__GNUC__) && __GNUC__ >= 2 -# define OPAL_SOS_FUNCTION __FUNCTION__ -# else -# define OPAL_SOS_FUNCTION "" -# endif -# else -# define OPAL_SOS_FUNCTION __func__ -# endif -#else -# define OPAL_SOS_FUNCTION __func__ -#endif - -/* Internal use only */ -#define OPAL_SOS_ERR_BASE OPAL_SUCCESS - -/** - * Size of the OPAL SOS error table. - * - * Since the index into the error table that is encoded in the error - * code is 9-bit long, setting a higher value than (1 << 9) would make - * no difference at all. - */ -#define OPAL_SOS_ERR_TABLE_SIZE 512 - -/** - * Maximum length for the error string stored per error code in the - * OPAL SOS error table. - */ -#define OPAL_SOS_MAX_ERR_LEN 1024 - -/** - * Reports an error to OPAL SOS reporter. - * - * Encodes an informational message with severity \c severity and - * other passed arguments like errnum, errmsg etc. It also remembers - * the line number, file name and the function name where the error - * has occurred. - * If the MCA parameter \c opal_sos_print_low is set, the error message - * is displayed on stderr using the "show help" subsystem. By default, - * informational messages are not printed out on stderr. - * If \c show_stack is set, the stacktrace is saved and/or printed - * along with the corresponding \c errmsg. - */ -#define OPAL_SOS_REPORT(severity, arg) opal_sos_reporter(__FILE__, __LINE__, \ - OPAL_SOS_FUNCTION, \ - severity, \ - opal_sos_build_error arg) - -/** - * Print or store an event with the maximum severity (EMERG). 
- */ -#define OPAL_SOS_EMERG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_EMERG, arg) - -/** - * Report an event of severity "ALERT". - */ -#define OPAL_SOS_ALERT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ALERT, arg) - -/** - * Report events with severity marked as "CRITICAL". - */ -#define OPAL_SOS_CRIT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_CRIT, arg) - -/** - * Prints and/or logs an error. - * This function can be used to log or print error events. - */ -#define OPAL_SOS_ERROR(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ERROR, arg) - -/** - * Prints and/or logs a warning. - * - * This function is similar to OPAL_SOS_INFO but with a higher - * severity. These events are printed out on the output stream - * by default. - */ -#define OPAL_SOS_WARN(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_WARN, arg) - -/** - * Report an error event with severity "NOTICE". - */ -#define OPAL_SOS_NOTICE(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_NOTICE,arg) - -/** - * Prints or logs an informational message in the OPAL SOS framework. - * Events with this severity are not printed, by default. However, - * they are still stored in the SOS table. - */ -#define OPAL_SOS_INFO(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_INFO, arg) - -/** - * Log debug events in the SOS framework. - */ -#define OPAL_SOS_DEBUG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_DEBUG, arg) - -/** - * Frees all the (entire stack of) OPAL SOS error objects associated - * with the encoded error code obtained after dereferencing the - * pointer \c errnum. - */ -#define OPAL_SOS_FREE(perrnum) opal_sos_free(perrnum) - -/** - * Print the warnings/errors/informational messages previously logged - * in to the SOS framework. - * - * This function prints the error details encoded by \c errnum. - * If \c show_history is true, the entire history for the error - * represented by \c errnum is printed on the output stream. 
- */ -#define OPAL_SOS_PRINT(errnum, show_history) \ - opal_sos_print(errnum, show_history) - -/** - * Attach the history from one error code to another error code - * Returns the target encoded error \c errtgt with history of \c - * errnum associated to it. - */ -#define OPAL_SOS_ATTACH(errtgt, errnum) \ - (errtgt = -((-errtgt & ~0xFF80000L) | \ - ((OPAL_SOS_GET_INDEX(errnum) & 0x1FFL) * 0x80000L))) - -/** - * Returns the index of the error attached to errnum using OPAL_SOS_ATTACH(). - */ -#define OPAL_SOS_GET_ATTACHED_INDEX(errnum) ((int) ((-errnum & 0xFF80000L) >> 19)) - -/** - * Returns the native error code for the given encoded error code \c - * errnum. \c errnum can be a native error code itself. - */ -#define OPAL_SOS_GET_ERROR_CODE(errnum) \ - ((errnum >= 0) ? errnum : (int) -(-errnum & 0x3FFL)) - -/** - * Sets the native error code for the potentially encoded error code. - * - * The lower 10 bits are reserved for the native error code. This - * macro sets the lower 10 bits of errnum to nativeerr. - */ -#define OPAL_SOS_SET_ERROR_CODE(errnum, nativeerr) \ - (errnum = -((-errnum & ~0x3FFL) | (-nativeerr & 0x3FFL))) - -/** - * Macro to check if the error encoded by \c errnum is a native error - * or an OPAL SOS encoded error. - */ -#define OPAL_SOS_IS_NATIVE(errnum) ((-errnum & ~0x3FFL) == 0) - -/** - * Returns the severity level for the potentially encoded error code. - * - * The severity is encoded in the last three bits of the first nibble. - */ -#define OPAL_SOS_GET_SEVERITY(errnum) ((int)((-errnum >> 28) & 0x7L)) - -/** - * Sets the severity level for the given error code \c errnum. - * - * This macros do not do strict error checking of the specified - * severity levels. - */ -#define OPAL_SOS_SET_SEVERITY(errnum, severity) \ - (errnum = -((-errnum & ~0x70000000L) | ((severity & 0x7L) * 0x10000000L))) - -/** - * Macro to get the encoded error severity level as a string. 
- * - * This macro accepts the argument \c severity and calls the corresponding - * function opal_sos_severity2str to convert it to a string. The result - * is returned in a static buffer that should not be freed with free(). - */ -#define OPAL_SOS_SEVERITY2STR(severity) opal_sos_severity2str(severity) - -/** - * Log an encoded error \c errnum. - * - * This macro prints out and consequently frees the entire stack of - * errors associated with the \c errnum. - */ -#define OPAL_SOS_LOG(errnum) opal_sos_log(errnum) - -/** - * \internal - * Returns the index into the error table of the error encoded by \c errnum. - * - * The index is 9-bit long stored from bit 11 to bit 20 in the encoded - * error code. - */ -#define OPAL_SOS_GET_INDEX(errnum) ((int)((-errnum & 0x7FC00L) >> 10)) - -/** - * \internal - * Sets the index into the error table for the error encoded by \c errnum. - */ -#define OPAL_SOS_SET_INDEX(errnum, index) \ - (errnum = -((-errnum & ~0x7FC00L) | ((index & 0x1FFL) * 0x400L))) - -BEGIN_C_DECLS - -/** This MCA parameter sos_print_low can be set to non-zero to enable - * the print-at-bottom preference for OPAL SOS. */ -OPAL_DECLSPEC extern bool opal_sos_print_low; - -/* Severity levels for OPAL SOS */ -typedef enum { - OPAL_SOS_SEVERITY_EMERG = LOG_EMERG, - OPAL_SOS_SEVERITY_ALERT = LOG_ALERT, - OPAL_SOS_SEVERITY_CRIT = LOG_CRIT, - OPAL_SOS_SEVERITY_ERROR = LOG_ERR, - OPAL_SOS_SEVERITY_WARN = LOG_WARNING, - OPAL_SOS_SEVERITY_NOTICE = LOG_NOTICE, - OPAL_SOS_SEVERITY_INFO = LOG_INFO, - OPAL_SOS_SEVERITY_DEBUG = LOG_DEBUG -} opal_sos_severity_t; - -typedef struct opal_sos_error_t { - /** Class parent */ - opal_object_t super; - - /** - * The encoded error code for a given type of error. - * - * errnum encodes a native error code (lower 10 bits) with the - * current severity (higher 2 bits) and an index into the error - * table along with the associated error, if there is one. 
- */ - int errnum; - - /** File in which the error occured */ - char *file; - - /** Line number on which the error was encountered */ - int line; - - /** This is an optional parameter that indicates the function in - which the error occured */ - char *func; - - /** The actual error message or string for the error indicated by - \c errnum */ - char *msg; - - /** Encoded error numbers of the previous and the next error. - These are used are used to maintain the history of an error. - The complete history of an error can be printed later using - OPAL_SOS_PRINT() */ - int prev; - int next; -} opal_sos_error_t; - -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_sos_error_t); - -/** - * Signature for OPAL SOS print function callback type. - */ -typedef void (*opal_sos_print_callback_fn_t) (int errcode); - -/** - * Signature for OPAL SOS reporter function callback type. - */ -typedef void (*opal_sos_reporter_callback_fn_t) (opal_sos_severity_t severity, int errcode, - const char *msg, ...) - __opal_attribute_format_funcptr__(__printf__, 3, 4); - -/** - * A global handle that points to the local OPAL SOS table. - * This is used by the notifier components to reference the local OPAL - * SOS table, especially for packing/unpacking and sending it over to - * the HNP. - */ -OPAL_DECLSPEC extern opal_hash_table_t opal_sos_table; - -/** - * A global handle that points to the OPAL SOS table lock. - * - */ -OPAL_DECLSPEC extern opal_mutex_t opal_sos_table_lock; - -/** - * \internal - * - * Initialize OPAL SOS. - * - * This function initializes and sets up the structures required to - * track the data handled by OPAL SOS. It is invoked by - * opal_util(). - */ -void opal_sos_init(void); - -/** - * \internal - * - * Shut down OPAL SOS. - * - * Invoked by opal_finalize() to deallocate the structures needed by - * OPAL SOS. - */ -void opal_sos_finalize(void); - -/** - * Prints or relays the error locally or using the selected notifier - * components. 
- */ -void -opal_sos_report_error(opal_sos_error_t *error); - -/** - * Builds an OPAL SOS error object given the parameters errnum, - * show_stack and errmsg. - * NOTE: This function only partially populates the SOS error object - * structure, setting the error message details but nothing about where - * the error occurred. Filling up the rest of the error object is left - * to OPAL SOS reporter which then handles the error appropriately. - * - * @param errnum - * @param show_stack - * @param errmsg - * - * @return - */ -OPAL_DECLSPEC opal_sos_error_t * -opal_sos_build_error(int errnum, bool show_stack, - const char *errmsg, ...) - __opal_attribute_format_funcptr__(__printf__, 3, 4); - -/** - * OPAL SOS reporter logs the error in the OPAL SOS error table or - * prints it out depending on the associated reporter callback. It can - * also relay the error messages to the selected notifier components - * using the OPAL SOS reporter callback interface. - * - * @param file - * @param line - * @param func - * @param opal_error - * - * @return encoded error code - */ -OPAL_DECLSPEC int opal_sos_reporter(const char *file, int line, const char *func, - opal_sos_severity_t severity, - opal_sos_error_t *opal_error); - -/** - * Prints the error encoded by the error number \c errnum - * - * @param errnum - * @param show_history - * - */ -OPAL_DECLSPEC void opal_sos_print(int errnum, bool show_history); - -OPAL_DECLSPEC int opal_sos_prettify_error(const char *error, char **pretty_error); - -/** - * Prints a single error represented by the OPAL SOS error object - * opal_sos_error_t. - */ -OPAL_DECLSPEC void opal_sos_print_error(opal_sos_severity_t severity, - int errnum, const char *errmsg, ...) - __opal_attribute_format_funcptr__(__printf__, 3, 4); - -/** - * Frees the error object represented by the error code \c errnum. - */ -OPAL_DECLSPEC void opal_sos_free(int *errnum); - -/** - * Logs (prints and frees) the error object represented by \c errnum. 
- */ -OPAL_DECLSPEC void opal_sos_log(int errnum); - -/** - * Returns the OPAL SOS severity level as a string. - * - */ -const char *opal_sos_severity2str(opal_sos_severity_t severity); - -/** - * \internal - * Return a unique key into the hash table (opal_sos_error_table) - * depending on the type and location of the error. - * - */ -int opal_sos_hash_error(opal_sos_error_t *error); - -/** - * Registers a print callback function for OPAL_SOS_PRINT() - */ -OPAL_DECLSPEC int -opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func, - opal_sos_print_callback_fn_t *prev_func); - -/** - * Registers a reporter callback function for OPAL_SOS_INFO(), - * OPAL_SOS_WARN() and OPAL_SOS_ERROR() - */ -OPAL_DECLSPEC int -opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func, - opal_sos_reporter_callback_fn_t *prev_func); - -END_C_DECLS - -#endif /* OPAL_SOS_H */ diff --git a/opal/util/stacktrace.c b/opal/util/stacktrace.c index 502e200933..25f899f649 100644 --- a/opal/util/stacktrace.c +++ b/opal/util/stacktrace.c @@ -519,9 +519,7 @@ int opal_util_register_stackhandlers (void) if (!showed_help && complain) { /* JMS This is icky; there is no error message aggregation here so this message may be repeated for - every single MPI process... This should be replaced - with OPAL_SOS when that is done so that it can be - properly aggregated. */ + every single MPI process... 
*/ opal_show_help("help-opal-util.txt", "stacktrace signal override", true, sig, sig, sig, string_value); diff --git a/orte/Makefile.am b/orte/Makefile.am index cf4decea57..13027fc9e0 100644 --- a/orte/Makefile.am +++ b/orte/Makefile.am @@ -63,7 +63,6 @@ include tools/Makefile.am include orted/Makefile.am include test/mpi/Makefile.include include test/system/Makefile.include -include threads/Makefile.am # Set the convenience library to be the same as the non-convenience # library, but a) it's marked as "noinst", so LT knows it's a diff --git a/orte/config/orte_configure_options.m4 b/orte/config/orte_configure_options.m4 index eb889353a8..72ebe798a9 100644 --- a/orte/config/orte_configure_options.m4 +++ b/orte/config/orte_configure_options.m4 @@ -13,7 +13,7 @@ dnl All rights reserved. dnl Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. dnl Copyright (c) 2009 IBM Corporation. All rights reserved. -dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights +dnl Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights dnl reserved. dnl Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. dnl @@ -114,25 +114,22 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_HEARTBEAT], [Whether we want daemon heartbeat monitoring enabled]) # -# Compile in resilient runtime code -# -AC_MSG_CHECKING([if want resilient runtime code enabled]) -AC_ARG_ENABLE(resilient-orte, - [AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient runtime code.])]) -if test "$enable_resilient_orte" = "yes"; then +# Do we want a separate orte progress thread? +AC_MSG_CHECKING([if want orte progress thread]) +AC_ARG_ENABLE([orte-progress-thread], + [AC_HELP_STRING([--enable-orte-progress-thread], + [Enable orte progress thread - for experiment by developers only! 
(default: disabled)])]) +if test "$enable_orte_progress_thread" = "yes"; then AC_MSG_RESULT([yes]) - orte_enable_resilient_code=1 + orte_enable_progress_thread=1 + AC_DEFINE_UNQUOTED(OPAL_EVENT_HAVE_THREAD_SUPPORT, 1, + [Thread support must be configured into the event library]) else AC_MSG_RESULT([no]) - orte_enable_resilient_code=0 + orte_enable_progress_thread=0 fi -AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"]) -AC_DEFINE_UNQUOTED([ORTE_RESIL_ORTE], [$orte_enable_resilient_code], - [Compile a resilient version of Open MPI]) - -AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"]) -AC_DEFINE_UNQUOTED([ORTE_ENABLE_EPOCH], [$orte_enable_resilient_code], - [Support for epoch in the ORTE process name enabled or not]) - +AC_DEFINE_UNQUOTED([ORTE_ENABLE_PROGRESS_THREAD], + [$orte_enable_progress_thread], + [Whether we want an orte progress thread enabled]) ])dnl diff --git a/orte/include/orte/types.h b/orte/include/orte/types.h index 6338696c5a..385ebff23c 100644 --- a/orte/include/orte/types.h +++ b/orte/include/orte/types.h @@ -82,54 +82,27 @@ typedef uint32_t orte_vpid_t; #define ORTE_VPID_MAX UINT32_MAX-2 #define ORTE_VPID_MIN 0 -#if ORTE_ENABLE_EPOCH -typedef uint32_t orte_epoch_t; -#define ORTE_EPOCH_T OPAL_UINT32 -#define ORTE_EPOCH_MAX UINT32_MAX-2 -#define ORTE_EPOCH_MIN 0 -#endif - -#if ORTE_ENABLE_EPOCH -#define ORTE_PROCESS_NAME_HTON(n) \ -do { \ - n.jobid = htonl(n.jobid); \ - n.vpid = htonl(n.vpid); \ - n.epoch = htonl(n.epoch); \ -} while (0) -#else #define ORTE_PROCESS_NAME_HTON(n) \ do { \ n.jobid = htonl(n.jobid); \ n.vpid = htonl(n.vpid); \ } while (0) -#endif -#if ORTE_ENABLE_EPOCH -#define ORTE_PROCESS_NAME_NTOH(n) \ -do { \ - n.jobid = ntohl(n.jobid); \ - n.vpid = ntohl(n.vpid); \ - n.epoch = ntohl(n.epoch); \ -} while (0) -#else #define ORTE_PROCESS_NAME_NTOH(n) \ do { \ n.jobid = ntohl(n.jobid); \ n.vpid = ntohl(n.vpid); \ } while (0) -#endif #define ORTE_NAME_ARGS(n) \ (unsigned long) ((NULL == n) 
? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \ (unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid) \ - (unsigned long) ((NULL == n) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch) /* * define invalid values */ #define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2) #define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2) -#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2) #define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF) /* @@ -137,7 +110,6 @@ do { \ */ #define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1) #define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1) -#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1) #define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF) /* @@ -146,16 +118,6 @@ do { \ struct orte_process_name_t { orte_jobid_t jobid; /**< Job number */ orte_vpid_t vpid; /**< Process id - equivalent to rank */ -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process. - * The epoch will start at ORTE_EPOCH_MIN and - * increment every time the process is detected as - * having stopped (including normal shutdown). The - * HNP will be responsible for informing all - * processes that did not directly detect the - * failure to increment their epochs. 
- */ -#endif }; typedef struct orte_process_name_t orte_process_name_t; @@ -179,10 +141,6 @@ typedef void* orte_iov_base_ptr_t; #define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */ #define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */ -#if ORTE_ENABLE_EPOCH -#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */ -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /* State-related types */ #define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */ @@ -205,11 +163,8 @@ typedef void* orte_iov_base_ptr_t; /* DAEMON command type */ #define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */ -/* GRPCOMM types */ -#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20) - /* IOF types */ -#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21) +#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20) /* provide a boundary for others to use */ diff --git a/orte/mca/errmgr/app/Makefile.am b/orte/mca/errmgr/app/Makefile.am deleted file mode 100644 index e164765296..0000000000 --- a/orte/mca/errmgr/app/Makefile.am +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -sources = \ - errmgr_app.h \ - errmgr_app_component.c \ - errmgr_app.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_errmgr_app_DSO -component_noinst = -component_install = mca_errmgr_app.la -else -component_noinst = libmca_errmgr_app.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_app_la_SOURCES = $(sources) -mca_errmgr_app_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_app_la_SOURCES =$(sources) -libmca_errmgr_app_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/app/errmgr_app.c b/orte/mca/errmgr/app/errmgr_app.c deleted file mode 100644 index 02fb1785b5..0000000000 --- a/orte/mca/errmgr/app/errmgr_app.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2009-2011 The Trustees of Indiana University. - * All rights reserved. - * - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/output.h" -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/show_help.h" -#include "orte/util/nidmap.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/odls/odls_types.h" - -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "errmgr_app.h" - -/* - * Module functions: Global - */ -static int init(void); -static int finalize(void); - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - 
orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - -static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, - orte_std_cntr_t num_procs); - -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata); -void epoch_change(int fd, - short event, - void *data); - -/****************** - * HNP module - ******************/ -orte_errmgr_base_module_t orte_errmgr_app_module = { - init, - finalize, - orte_errmgr_base_log, - orte_errmgr_base_abort, - orte_errmgr_app_abort_peers, - update_state, - NULL, - NULL, - NULL, - orte_errmgr_base_register_migration_warning -#if ORTE_RESIL_ORTE - ,orte_errmgr_base_set_fault_callback -#endif -}; - -/************************ - * API Definitions - ************************/ -static int init(void) -{ - int ret = ORTE_SUCCESS; - -#if ORTE_RESIL_ORTE - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_EPOCH_CHANGE, - ORTE_RML_PERSISTENT, - epoch_change_recv, - NULL); -#endif - - return ret; -} - -static int finalize(void) -{ -#if ORTE_RESIL_ORTE - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_EPOCH_CHANGE); -#endif - - return ORTE_SUCCESS; -} - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_ns_cmp_bitmask_t mask; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app: job %s reported state %s" - " for proc %s state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), exit_code)); - - /* - * if orte is trying to shutdown, just let it - */ - if (orte_finalizing) { - return ORTE_SUCCESS; - } - - if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; - /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; - } - - /* delete the route */ - orte_routed.delete_route(proc); - /* see is this was a lifeline */ - if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { - return ORTE_ERR_UNRECOVERABLE; - } - } - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata) { - - ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change); -} - -void epoch_change(int fd, - short event, - void *data) { - orte_message_event_t *mev = (orte_message_event_t *) data; - opal_buffer_t *buffer = mev->buffer; - orte_process_name_t *proc; - int n = 1, ret, num_dead, i; - opal_pointer_array_t *procs; - - if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) { - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Received epoch change notification", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - procs = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead); - for (i = 0; i < num_dead; i++) { - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - proc[i].epoch++; - orte_util_set_epoch(&proc[i], proc[i].epoch); - - 
opal_pointer_array_add(procs, &proc[i]); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Epoch for %s updated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc[i]))); - } - - if (NULL != fault_cbfunc && 0 < num_dead) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - (*fault_cbfunc)(procs); - } else if (NULL == fault_cbfunc) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback failed (NULL pointer)!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } else { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback failed (num_dead <= 0)!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } - - free(proc); - OBJ_RELEASE(procs); -} -#endif - -static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_buffer_t buffer; - orte_std_cntr_t i; - orte_daemon_cmd_flag_t command = ORTE_DAEMON_ABORT_PROCS_CALLED; - - /* - * Pack up the list of processes and send them to the HNP - */ - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* pack number of processes */ - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(num_procs), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* Pack the list of names */ - for( i = 0; i < num_procs; ++i ) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(procs[i]), 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - - /* Send to HNP for termination */ - if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_DAEMON, 0))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - -cleanup: - OBJ_DESTRUCT(&buffer); - - return 
exit_status; -} diff --git a/orte/mca/errmgr/app/errmgr_app.h b/orte/mca/errmgr/app/errmgr_app.h deleted file mode 100644 index 4674b5bf24..0000000000 --- a/orte/mca/errmgr/app/errmgr_app.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#ifndef MCA_ERRMGR_app_EXPORT_H -#define MCA_ERRMGR_app_EXPORT_H - -#include "orte_config.h" - -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Local Component structures - */ - -ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_app_component; - -ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module; - -END_C_DECLS - -#endif /* MCA_ERRMGR_app_EXPORT_H */ diff --git a/orte/mca/errmgr/app/errmgr_app_component.c b/orte/mca/errmgr/app/errmgr_app_component.c deleted file mode 100644 index dda89e52b8..0000000000 --- a/orte/mca/errmgr/app/errmgr_app_component.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "opal/util/output.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "errmgr_app.h" - -/* - * Public string for version number - */ -const char *orte_errmgr_app_component_version_string = - "ORTE ERRMGR app MCA component version " ORTE_VERSION; - -/* - * Local functionality - */ -static int errmgr_app_open(void); -static int errmgr_app_close(void); -static int errmgr_app_component_query(mca_base_module_t **module, int *priority); - -/* - * Instantiate the public struct with all of our public information - * and pointer to our public functions in it - */ -orte_errmgr_base_component_t mca_errmgr_app_component = -{ - /* Handle the general mca_component_t struct containing - * meta information about the component itapp - */ - { - ORTE_ERRMGR_BASE_VERSION_3_0_0, - /* Component name and version */ - "app", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - errmgr_app_open, - errmgr_app_close, - errmgr_app_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - /* Verbosity level */ - 0, - /* opal_output handler */ - -1, - /* Default priority */ - 5 -}; - -static int errmgr_app_open(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_app_close(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_app_component_query(mca_base_module_t **module, int *priority) -{ - if (ORTE_PROC_IS_APP) { - /* keep our priority low so that other modules are higher - * and will run before us - */ - *priority = 5; - *module = (mca_base_module_t *)&orte_errmgr_app_module; - return ORTE_SUCCESS; - } - - *priority = -1; - *module = NULL; - return ORTE_ERROR; -} diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index b7d5aa4f1a..d102253e86 100644 --- 
a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -100,13 +102,11 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; item->proc_name.jobid = ORTE_JOBID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); } void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); item->proc_name.jobid = ORTE_JOBID_INVALID; } @@ -142,13 +142,11 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t, void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN); item->proc_name.jobid = ORTE_JOBID_INVALID; item->node_name = NULL; item->map_proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN); item->map_proc_name.jobid = ORTE_JOBID_INVALID; item->map_node_name = NULL; @@ -159,7 +157,6 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item) void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) { item->proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID); item->proc_name.jobid = ORTE_JOBID_INVALID; if( NULL != item->node_name ) { @@ -168,7 +165,6 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item) } item->map_proc_name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID); item->map_proc_name.jobid = ORTE_JOBID_INVALID; if( NULL != 
item->map_node_name ) { @@ -200,17 +196,9 @@ void orte_errmgr_base_log(int error_code, char *filename, int line) return; } - if (NULL != orte_process_info.job_name) { - opal_output(0, "[[%s][%s][%s][%d]] ORTE_ERROR_LOG: %s in file %s at line %d", - orte_process_info.job_name, - (NULL == orte_process_info.job_instance) ? "NULL" : orte_process_info.job_instance, - (NULL == orte_process_info.executable) ? "NULL" : orte_process_info.executable, - orte_process_info.app_rank, errstring, filename, line); - } else { - opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - errstring, filename, line); - } + opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + errstring, filename, line); } #if WANT_PMI_SUPPORT @@ -290,19 +278,6 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...) /* No way to reach here */ } -int orte_errmgr_base_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - /* - * This is a stub function that is only meant to be called by tools, - * so it will always return success. - */ - return ORTE_SUCCESS; -} void orte_errmgr_base_register_migration_warning(struct timeval *tv) { /* stub function - ignore */ diff --git a/orte/mca/errmgr/base/errmgr_base_open.c b/orte/mca/errmgr/base/errmgr_base_open.c index de979a7ae1..535f1edce8 100644 --- a/orte/mca/errmgr/base/errmgr_base_open.c +++ b/orte/mca/errmgr/base/errmgr_base_open.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -64,7 +66,6 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = { orte_errmgr_base_log, orte_errmgr_base_abort, orte_errmgr_base_abort_peers, - orte_errmgr_base_update_state, NULL, /* predicted_fault */ NULL, /* suggest_map_targets */ NULL, /* ft_event */ @@ -83,8 +84,6 @@ orte_errmgr_base_module_t orte_errmgr = { NULL, NULL, NULL, - NULL, - NULL, NULL }; diff --git a/orte/mca/errmgr/base/errmgr_base_tool.c b/orte/mca/errmgr/base/errmgr_base_tool.c index b86ca04b7b..6e8dd64bae 100644 --- a/orte/mca/errmgr/base/errmgr_base_tool.c +++ b/orte/mca/errmgr/base/errmgr_base_tool.c @@ -267,7 +267,6 @@ static int errmgr_base_tool_start_cmdline_listener(void) */ errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID; errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN); if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MIGRATE, 0, @@ -379,14 +378,12 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) { swap_dest.jobid = errmgr_cmdline_sender.jobid; swap_dest.vpid = errmgr_cmdline_sender.vpid; - ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch); errmgr_cmdline_sender = *sender; orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS); errmgr_cmdline_sender.jobid = swap_dest.jobid; errmgr_cmdline_sender.vpid = swap_dest.vpid; - ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch); goto cleanup; } diff --git a/orte/mca/errmgr/base/errmgr_private.h b/orte/mca/errmgr/base/errmgr_private.h index 433a47ea12..beb5c2687d 100644 --- a/orte/mca/errmgr/base/errmgr_private.h +++ b/orte/mca/errmgr/base/errmgr_private.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -69,18 +71,10 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns; ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line); ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...) - __opal_attribute_format__(__printf__, 2, 3) - __opal_attribute_noreturn__; + __opal_attribute_format__(__printf__, 2, 3); ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs); -ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv); END_C_DECLS diff --git a/orte/mca/errmgr/default_app/configure.m4 b/orte/mca/errmgr/default_app/configure.m4 index 0306771fb8..2c242912fa 100644 --- a/orte/mca/errmgr/default_app/configure.m4 +++ b/orte/mca/errmgr/default_app/configure.m4 @@ -13,7 +13,7 @@ AC_DEFUN([MCA_orte_errmgr_default_app_CONFIG], [ AC_CONFIG_FILES([orte/mca/errmgr/default_app/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/errmgr/default_app/errmgr_default_app.c b/orte/mca/errmgr/default_app/errmgr_default_app.c index d5f88f0a94..9c96853edb 100644 --- a/orte/mca/errmgr/default_app/errmgr_default_app.c +++ b/orte/mca/errmgr/default_app/errmgr_default_app.c @@ -7,7 +7,8 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +35,7 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/routed/routed.h" #include "orte/mca/odls/odls_types.h" +#include "orte/mca/state/state.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" @@ -45,13 +47,6 @@ static int init(void); static int finalize(void); -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs); @@ -64,7 +59,6 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = { orte_errmgr_base_log, orte_errmgr_base_abort, abort_peers, - update_state, NULL, NULL, NULL, @@ -72,11 +66,16 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = { NULL }; +static void proc_errors(int fd, short args, void *cbdata); + /************************ * API Definitions ************************/ static int init(void) { + /* setup state machine to trap proc errors */ + orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); + return ORTE_SUCCESS; } @@ -85,43 +84,43 @@ static int finalize(void) return ORTE_SUCCESS; } -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) +static void proc_errors(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_ns_cmp_bitmask_t mask; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_app: job %s reported state %s" - " for proc %s state %s exit_code %d", + "%s errmgr:default_app: proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), exit_code)); + ORTE_NAME_PRINT(&caddy->name), + orte_proc_state_to_str(caddy->proc_state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { - return ORTE_SUCCESS; + OBJ_RELEASE(caddy); + return; } - if (ORTE_PROC_STATE_COMM_FAILED == state) { + if (ORTE_PROC_STATE_COMM_FAILED == caddy->proc_state) { mask = ORTE_NS_CMP_ALL; /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &caddy->name)) { + OBJ_RELEASE(caddy); + return; } /* see is this was a lifeline */ - if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { - return ORTE_ERR_UNRECOVERABLE; + if (ORTE_SUCCESS != orte_routed.route_lost(&caddy->name)) { + /* order an exit */ + ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE); + OBJ_RELEASE(caddy); + exit(1); } } - return ORTE_SUCCESS; + + /* cleanup */ + OBJ_RELEASE(caddy); } static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) diff --git a/orte/mca/errmgr/default_hnp/configure.m4 b/orte/mca/errmgr/default_hnp/configure.m4 index 6dced14eb4..61a954c4ac 100644 --- a/orte/mca/errmgr/default_hnp/configure.m4 +++ b/orte/mca/errmgr/default_hnp/configure.m4 @@ -13,7 +13,7 @@ AC_DEFUN([MCA_orte_errmgr_default_hnp_CONFIG], [ AC_CONFIG_FILES([orte/mca/errmgr/default_hnp/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index 232d3e250c..b58c06673e 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -7,7 +7,7 @@ * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -44,6 +44,7 @@ #include "orte/mca/notifier/notifier.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/ess/ess.h" +#include "orte/mca/state/state.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" @@ -69,13 +70,6 @@ static int predicted_fault(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map); -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - static int suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); @@ -92,7 +86,6 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = { orte_errmgr_base_log, orte_errmgr_base_abort, orte_errmgr_base_abort_peers, - update_state, predicted_fault, suggest_map_targets, ft_event, @@ -104,24 +97,21 @@ orte_errmgr_base_module_t orte_errmgr_default_hnp_module = { /* * Local functions */ -static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); -static void failed_start(orte_job_t *jdata); -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code); -static void check_job_complete(orte_job_t *jdata); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -static void update_proc(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - +static void default_hnp_abort(orte_job_t *jdata); +static void job_errors(int fd, short args, void *cbdata); +static void proc_errors(int fd, short args, void *cbdata); /********************** * From DEFAULT_HNP **********************/ static int init(void) { + /* setup state machine to 
trap job errors */ + orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + + /* setup state machine to trap proc errors */ + orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); + return ORTE_SUCCESS; } @@ -130,359 +120,431 @@ static int finalize(void) return ORTE_SUCCESS; } -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) +static void job_errors(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; - orte_proc_t *pptr; + orte_job_state_t jobstate; orte_exit_code_t sts; + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return; + } + + /* if the jdata is NULL, then we abort as this + * is reporting an unrecoverable error + */ + if (NULL == caddy->jdata) { + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); + OBJ_RELEASE(caddy); + return; + } + + /* update the state */ + jdata = caddy->jdata; + jobstate = caddy->job_state; + jdata->state = jobstate; + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: job %s reported state %s" - " for proc %s state %s pid %d exit_code %d", + "%s errmgr:default_hnp: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), pid, exit_code)); + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jobstate))); + + /* set global flags */ + if (ORTE_PROC_MY_NAME->jobid == jdata->jobid && !orte_abnormal_term_ordered) { + /* set the flag indicating that a daemon failed so we use the proper + * methods for attempting to shutdown the rest of the system + */ + orte_abnormal_term_ordered = true; + } + if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { + orte_never_launched = true; + jdata->num_terminated = jdata->num_procs; + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED); + OBJ_RELEASE(caddy); + return; + } + + if (ORTE_JOB_STATE_FAILED_TO_START == jobstate || + ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) { + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != jdata->aborted_proc) { + sts = jdata->aborted_proc->exit_code; + if (ORTE_PROC_MY_NAME->jobid == jdata->jobid && !orte_abnormal_term_ordered) { + /* set the flag indicating that a daemon failed so we use the proper + * methods for attempting to shutdown the rest of the system + */ + orte_abnormal_term_ordered = true; + if (WIFSIGNALED(sts)) { /* died on signal */ +#ifdef WCOREDUMP + if (WCOREDUMP(sts)) { + orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, + WTERMSIG(sts)); + sts = WTERMSIG(sts); + } else { + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(sts)); + sts = WTERMSIG(sts); + } +#else + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(sts)); + sts = WTERMSIG(sts); +#endif /* WCOREDUMP */ + } else { + orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, + WEXITSTATUS(sts)); + sts = WEXITSTATUS(sts); + } + } + } + } + + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT); + OBJ_RELEASE(caddy); +} + 
+static void cleanup_local_proc(orte_job_t *jdata, + orte_process_name_t *proc) +{ + orte_proc_t *pptr; + int i; + + /* see if this is a local proc to me */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, proc, &pptr->name)) { + opal_pointer_array_set_item(orte_local_children, i, NULL); + OBJ_RELEASE(pptr); + jdata->num_local_procs--; + return; + } + } +} + +static void proc_errors(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata; + orte_proc_t *pptr, *proct; + orte_process_name_t *proc = &caddy->name; + orte_proc_state_t state = caddy->proc_state; + int i; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:default_hnp: for proc %s state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { - return ORTE_SUCCESS; + goto cleanup; } - if (NULL == proc) { - /* this is an update for an entire local job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { - orte_never_launched = true; - } - orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - default_hnp_abort(job, exit_code); - return ORTE_SUCCESS; - } - - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - /* update the state */ - jdata->state = jobstate; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: job %s reported state %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), - orte_job_state_to_str(jobstate))); - - switch 
(jobstate) { - case ORTE_JOB_STATE_TERMINATED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_ABORTED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jdata); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - sts = exit_code; - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - if (WIFSIGNALED(exit_code)) { /* died on signal */ -#ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } else { - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } -#else - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); -#endif /* WCOREDUMP */ - } else { - orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, - WEXITSTATUS(exit_code)); - sts = WEXITSTATUS(exit_code); - } - } - default_hnp_abort(jdata->jobid, sts); - } - break; - - case ORTE_JOB_STATE_SILENT_ABORT: - failed_start(jdata); - check_job_complete(jdata); /* 
set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - } - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_JOB_STATE_RUNNING: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); - /* record that we reported */ - jdata->num_daemons_reported++; - /* report if requested */ - if (orte_report_launch_progress) { - if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { - opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", - (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, - (int)jdata->num_launched, (int)jdata->num_procs); - } - } - break; - case ORTE_JOB_STATE_NEVER_LAUNCHED: - orte_never_launched = true; - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, - ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - exit_code); - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_COMM_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - default: - break; - } - return ORTE_SUCCESS; - } - /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* if the orteds are terminating, check job complete */ if (orte_orteds_term_ordered) { opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); - check_job_complete(NULL); - return ORTE_SUCCESS; + goto cleanup; } else { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + goto cleanup; } } + pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); - /* update is for a specific proc */ - switch (state) { - case ORTE_PROC_STATE_ABORTED: - case ORTE_PROC_STATE_ABORTED_BY_SIG: - case ORTE_PROC_STATE_TERM_WO_SYNC: - update_proc(jdata, proc, state, pid, exit_code); - /* kill all local procs */ - killprocs(proc->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_FAILED_TO_START: - case ORTE_PROC_STATE_CALLED_ABORT: - update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_REGISTERED: - case ORTE_PROC_STATE_RUNNING: - update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_LAUNCHED: - /* record the pid for this child */ - update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_TERMINATED: - case ORTE_PROC_STATE_TERM_NON_ZERO: - case ORTE_PROC_STATE_KILLED_BY_CMD: - update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - break; - - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - /* kill all jobs */ - update_proc(jdata, proc, state, pid, exit_code); - /* kill all local procs */ - killprocs(proc->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - default_hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_COMM_FAILED: + /* we MUST handle a communication failure before doing anything else + * as it requires some special care to avoid normal termination issues + * for local application procs + */ + if (ORTE_PROC_STATE_COMM_FAILED == state) { /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid != proc->jobid) { /* nope - ignore it */ - break; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Comm failure to non-daemon proc - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto cleanup; } /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s My own connection - ignoring it", + "%s Comm failure on my own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - break; + goto cleanup; } - /* if we have ordered orteds to terminate, record it */ - if (orte_orteds_term_ordered) { + /* if we have ordered orteds to terminate or abort + * is in progress, record it */ + if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Daemons terminating - recording daemon %s as gone", + "%s Comm failure: daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); - /* update daemon job */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - if (pptr->state < ORTE_PROC_STATE_TERMINATED) { - pptr->state = state; - jdata->num_terminated++; + /* if all my routes and local children are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + proct->alive && proct->state < ORTE_PROC_STATE_UNTERMINATED) { + /* at least one is still alive */ + goto cleanup; } - } - /* check if complete */ - check_job_complete(jdata); - break; + } + /* call our appropriate exit procedure */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr_hnp: all routes and children gone - ordering exit", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + 
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + } + goto cleanup; } - /* if abort is in progress, see if this one failed to tell - * us it had terminated + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Comm failure: daemon %s - aborting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* record the first one to fail */ + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_COMM_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abort the system */ + default_hnp_abort(jdata); + goto cleanup; + } + + /* update the proc state - can get multiple reports on a proc + * depending on circumstances, so ensure we only do this once + */ + if (pptr->state < ORTE_PROC_STATE_TERMINATED) { + pptr->state = state; + jdata->num_terminated++; + } + /* since we only come here if the proc terminated, + * cleanup the local proc, if required + */ + cleanup_local_proc(jdata, proc); + + /* ensure we record the failed proc properly so we can report + * the error once we terminate + */ + switch (state) { + case ORTE_PROC_STATE_KILLED_BY_CMD: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Abort in progress - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - if (pptr->state < ORTE_PROC_STATE_TERMINATED) { - 
pptr->state = state; - jdata->num_terminated++; - } + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } + /* don't abort the job as this isn't an abnormal termination */ + break; + + case ORTE_PROC_STATE_ABORTED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s aborted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_ABORTED_BY_SIG: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_TERM_WO_SYNC: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + /* now treat a special case - if the proc exit'd without a required + * sync, it may have done so 
with a zero exit code. We want to ensure + * that the user realizes there was an error, so in this -one- case, + * we overwrite the process' exit code with the default error code + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_FAILED_TO_LAUNCH: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + if (!jdata->abort) { + if (ORTE_PROC_STATE_FAILED_TO_START) { + jdata->state = ORTE_JOB_STATE_FAILED_TO_START; + } else { + jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH; + } + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_CALLED_ABORT: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s called abort", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_CALLED_ABORT; + /* point to the first proc to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s exceeded sensor boundary", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = 
pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + } + /* abnormal termination - abort */ + default_hnp_abort(jdata); + break; + + case ORTE_PROC_STATE_TERM_NON_ZERO: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s exited with non-zero status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + pptr->exit_code)); + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + /* track the number of non-zero exits */ + jdata->num_non_zero_exit++; + if (orte_abort_non_zero_exit) { + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + } + /* user requested we abort in this scenario */ + default_hnp_abort(jdata); + } else { + /* user requested we consider this normal termination */ + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } - /* check if complete */ - check_job_complete(jdata); } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: - /* heartbeats are only for daemons */ - if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - if (pptr->state < ORTE_PROC_STATE_TERMINATED) { - pptr->state = state; - jdata->num_terminated++; - } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s heartbeat failed", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; + /* point to the first rank to cause the problem */ + jdata->aborted_proc = pptr; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(pptr); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* remove from dependent routes, if it is one */ 
orte_routed.route_lost(proc); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ - default_hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - return ORTE_ERR_UNRECOVERABLE; + default_hnp_abort(jdata); + break; default: + /* shouldn't get this, but terminate job if required */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp: proc %s default error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + if (jdata->num_terminated == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } break; } - return ORTE_SUCCESS; + cleanup: + OBJ_RELEASE(caddy); } static int predicted_fault(opal_list_t *proc_list, @@ -507,35 +569,56 @@ static int ft_event(int state) /***************** * Local Functions *****************/ -static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) +static void default_hnp_abort(orte_job_t *jdata) { int rc; /* if we are already in progress, then ignore this call */ if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s with status %d", + "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); + ORTE_JOBID_PRINT(jdata->jobid))); return; } OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: abort called on job %s with status %d", + "%s errmgr:default_hnp: abort called on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); + ORTE_JOBID_PRINT(jdata->jobid))); + + /* the job aborted - turn off any sensors on this job */ + orte_sensor.stop(jdata->jobid); /* set control params to indicate we are terminating */ orte_job_term_ordered = true; - orte_abnormal_term_ordered = true; orte_enable_recovery = false; - 
/* set the exit status, just in case whomever called us failed - * to do so - it can only be done once, so we are protected - * from overwriting it + /* if it is the daemon job that aborted, then we need + * to flag an abnormal term - otherwise, just abort + * the job cleanly */ - ORTE_UPDATE_EXIT_STATUS(exit_code); + if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) { + orte_abnormal_term_ordered = true; + } + if (0 < jdata->num_non_zero_exit) { + /* warn user */ + opal_output(orte_clean_output, + "-------------------------------------------------------\n" + "%s job %s terminated normally, but %d %s. Per user-direction, the job has been aborted.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + jdata->num_non_zero_exit, + (1 == jdata->num_non_zero_exit) ? "process returned\na non-zero exit code." : + "processes returned\nnon-zero exit codes."); + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:default_hnp: ordering orted termination", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* tell the plm to terminate the orteds - they will automatically * kill their local procs */ @@ -543,691 +626,3 @@ static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) ORTE_ERROR_LOG(rc); } } - -static void failed_start(orte_job_t *jdata) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_UNTERMINATED < child->state) { - /* get the master proc object */ - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = child->state; - proc->exit_code = child->exit_code; - /* update the counter so we can terminate */ - jdata->num_terminated++; - /* remove the child from our list */ - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jobdat->num_local_procs--; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:default_hnp: job %s reported incomplete start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); -} - -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = jobstate; - jdata->state = jobstate; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (jdata->jobid == child->name->jobid) { - child->state = state; - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = state; - if (proc->exit_code < exit_code) { - proc->exit_code = exit_code; - } - if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jdata->num_terminated++; - jobdat->num_local_procs--; - } else if (ORTE_PROC_STATE_RUNNING) { - jdata->num_launched++; - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - -} - -static void update_proc(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_proc_t *proct; - orte_odls_job_t *jobdat, *jdat; - int i; - - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == jdata->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /*** UPDATE 
LOCAL CHILD ***/ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (child->name->vpid == proc->vpid) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - if (NULL != jobdat) { - jobdat->num_local_procs--; - } - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - return; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - /*** UPDATE REMOTE CHILD ***/ - for (i=0; i < jdata->procs->size; i++) { - if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - if (proct->name.jobid != proc->jobid || - proct->name.vpid != proc->vpid) { - continue; - } - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } else if 
(ORTE_PROC_STATE_UNTERMINATED < state) { - /* update the counter so we can terminate */ - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } - return; - } -} - -static void check_job_complete(orte_job_t *jdata) -{ - orte_proc_t *proc; - int i; - orte_std_cntr_t j; - orte_job_t *job; - orte_node_t *node; - orte_job_map_t *map; - orte_std_cntr_t index; - bool one_still_alive; - orte_vpid_t non_zero=0, lowest=0; - char *msg; - - if (NULL == jdata) { - /* just check to see if the daemons are complete */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_complete - received NULL job, checking daemons", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto CHECK_DAEMONS; - } - - for (i=0; i < jdata->procs->size && !jdata->abort; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - /* the proc array may no longer be left justified, so - * we need to check everything - */ - continue; - } - - if (0 != proc->exit_code) { - non_zero++; - if (0 == lowest) { - lowest = proc->exit_code; - } - } - - switch (proc->state) { - case ORTE_PROC_STATE_KILLED_BY_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s killed by cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - /* we ordered this proc to die, so it isn't an abnormal termination - * and we don't flag it as such - just check the remaining jobs to - * see if anyone is still alive - */ - if (jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated - now we need to check to see if ALL - * the other jobs have also completed and wakeup if that is true - */ - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; - } - } - goto CHECK_ALIVE; - break; - case ORTE_PROC_STATE_ABORTED: - OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s aborted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_FAILED_TO_START: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr_default_hnp:check_job_completed proc %s failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_FAILED_TO_START; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_ABORTED_BY_SIG: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s aborted by signal", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_WO_SYNC: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s terminated without sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - 
OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - /* now treat a special case - if the proc exit'd without a required - * sync, it may have done so with a zero exit code. We want to ensure - * that the user realizes there was an error, so in this -one- case, - * we overwrite the process' exit code with the default error code - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - break; - case ORTE_PROC_STATE_COMM_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_COMM_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_CALLED_ABORT: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_CALLED_ABORT; - /* point to the first proc to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_NON_ZERO: - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - if (orte_abort_non_zero_exit) { - if (!jdata->abort) { - jdata->state = 
ORTE_JOB_STATE_NON_ZERO_TERM; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - } - } - break; - - default: - if (ORTE_PROC_STATE_UNTERMINATED < proc->state && - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed proc %s terminated and continuous", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - proc->state = ORTE_PROC_STATE_ABORTED; - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - } - break; - } - } - - if (jdata->abort) { - /* the job aborted - turn off any sensors on this job */ - orte_sensor.stop(jdata->jobid); - } - - if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && - jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated */ - jdata->state = ORTE_JOB_STATE_TERMINATED; - - /* turn off any sensor monitors on this job */ - orte_sensor.stop(jdata->jobid); - - if (0 < non_zero) { - if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { - /* update the exit code */ - ORTE_UPDATE_EXIT_STATUS(lowest); - } - - /* warn user */ - opal_output(orte_clean_output, - "-------------------------------------------------------\n" - "While %s job %s terminated normally, %s %s. Further examination may be required.\n" - "-------------------------------------------------------", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), - ORTE_VPID_PRINT(non_zero), - (1 == non_zero) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed declared job %s normally terminated - checking all jobs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - } - - /* if this job is a continuously operating one, then don't do - * anything further - just return here - */ - if (NULL != jdata && - (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || - ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { - goto CHECK_ALIVE; - } - - /* if the job that is being checked is the HNP, then we are - * trying to terminate the orteds. In that situation, we - * do -not- check all jobs - we simply notify the DEFAULT_HNP - * that the orteds are complete. Also check special case - * if jdata is NULL - we want - * to definitely declare the job done if the orteds - * have completed, no matter what else may be happening. - * This can happen if a ctrl-c hits in the "wrong" place - * while launching - */ -CHECK_DAEMONS: - if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - if (0 == orte_routed.num_routes()) { - /* orteds are done! */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (NULL == jdata) { - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - } - jdata->state = ORTE_JOB_STATE_TERMINATED; - orte_quit(); - return; - } - return; - } - - /* Release the resources used by this job. Since some errmgrs may want - * to continue using resources allocated to the job as part of their - * fault recovery procedure, we only do this once the job is "complete". - * Note that an aborted/killed job -is- flagged as complete and will - * therefore have its resources released. 
We need to do this after - * we call the errmgr so that any attempt to restart the job will - * avoid doing so in the exact same place as the current job - */ - if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { - map = jdata->map; - for (index = 0; index < map->nodes->size; index++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { - continue; - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing procs from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); - for (i = 0; i < node->procs->size; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (proc->name.jobid != jdata->jobid) { - /* skip procs from another job */ - continue; - } - node->slots_inuse--; - node->num_procs--; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing proc %s from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), node->name)); - /* set the entry in the node array to NULL */ - opal_pointer_array_set_item(node->procs, i, NULL); - /* release the proc once for the map entry */ - OBJ_RELEASE(proc); - } - } - OBJ_RELEASE(map); - jdata->map = NULL; - } - -CHECK_ALIVE: - /* now check to see if all jobs are done - release this jdata - * object when we find it - */ - one_still_alive = false; - for (j=1; j < orte_job_data->size; j++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { - /* since we are releasing jdata objects as we - * go, we can no longer assume that the job_data - * array is left justified - */ - continue; - } - /* if this is the job we are checking AND it normally terminated, - * then go ahead and release it. 
We cannot release it if it - * abnormally terminated as mpirun needs the info so it can - * report appropriately to the user - * - * NOTE: do not release the primary job (j=1) so we - * can pretty-print completion message - */ - if (NULL != jdata && job->jobid == jdata->jobid && - (jdata->state == ORTE_JOB_STATE_TERMINATED || - jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { - /* release this object, ensuring that the - * pointer array internal accounting - * is maintained! - */ - if (1 < j) { - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ - OBJ_RELEASE(jdata); - } - continue; - } - /* if the job is flagged to not be monitored, skip it */ - if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { - continue; - } - /* when checking for job termination, we must be sure to NOT check - * our own job as it - rather obviously - has NOT terminated! - */ - if (job->num_terminated < job->num_procs) { - /* we have at least one job that is not done yet - we cannot - * just return, though, as we need to ensure we cleanout the - * job data for the job that just completed - */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed job %s is not terminated (%d:%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs)); - one_still_alive = true; - } - else { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed job %s is terminated (%d vs %d [%s])", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs, - (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); - } - } - /* if a job is still alive, we just return */ - if (one_still_alive) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed at least one job is not terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - /* if we get here, then all jobs are done, so terminate */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_hnp:check_job_completed all jobs terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* set the exit status to 0 - this will only happen if it - * wasn't already set by an error condition - */ - ORTE_UPDATE_EXIT_STATUS(0); - /* provide a notifier message if that framework is active - ignored otherwise */ - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { - if (NULL == job->name) { - job->name = strdup(orte_process_info.nodename); - } - if (NULL == job->instance) { - asprintf(&job->instance, "%d", orte_process_info.pid); - } - if (0 == orte_exit_status) { - asprintf(&msg, "Job %s:%s complete", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); - } else { - asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); - } - free(msg); - /* this job object will be release during finalize */ - } - - orte_jobs_complete(); - /* if I am the only daemon alive, then I can exit now */ - if (0 == orte_routed.num_routes()) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_quit(); - } -} - -static void killprocs(orte_jobid_t job, orte_vpid_t vpid) -{ - opal_pointer_array_t cmd; - orte_proc_t proc; - int rc; - - /* stop local sensors for this job */ - if (ORTE_VPID_WILDCARD == vpid) { - orte_sensor.stop(job); - } - - if (ORTE_JOBID_WILDCARD == job - && ORTE_VPID_WILDCARD == vpid) { - - if (ORTE_SUCCESS != (rc = 
orte_odls.kill_local_procs(NULL))) { - ORTE_ERROR_LOG(rc); - } - return; - } - - OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); - OBJ_CONSTRUCT(&proc, orte_proc_t); - proc.name.jobid = job; - proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); - opal_pointer_array_add(&cmd, &proc); - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&cmd); - OBJ_DESTRUCT(&proc); -} diff --git a/orte/mca/errmgr/default_orted/configure.m4 b/orte/mca/errmgr/default_orted/configure.m4 index cfc2eb6348..7ae3aef7cc 100644 --- a/orte/mca/errmgr/default_orted/configure.m4 +++ b/orte/mca/errmgr/default_orted/configure.m4 @@ -13,7 +13,7 @@ AC_DEFUN([MCA_orte_errmgr_default_orted_CONFIG], [ AC_CONFIG_FILES([orte/mca/errmgr/default_orted/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c index 810728476e..2ed26e8b73 100644 --- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c +++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c @@ -6,6 +6,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,8 +34,7 @@ #include "orte/util/session_dir.h" #include "orte/util/show_help.h" #include "orte/util/nidmap.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/data_type_support/orte_dt_support.h" + #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -42,8 +43,11 @@ #include "orte/mca/routed/routed.h" #include "orte/mca/sensor/sensor.h" #include "orte/mca/ess/ess.h" +#include "orte/mca/state/state.h" + #include "orte/runtime/orte_quit.h" #include "orte/runtime/orte_globals.h" +#include "orte/runtime/data_type_support/orte_dt_support.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" @@ -51,18 +55,6 @@ #include "errmgr_default_orted.h" -/* Local functions */ -static bool any_live_children(orte_jobid_t job); -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat); -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child); -static bool all_children_registered(orte_jobid_t job); -static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); -static void update_local_children(orte_odls_job_t *jobdat, - orte_job_state_t jobstate, - orte_proc_state_t state); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); - /* * Module functions: Global */ @@ -73,13 +65,6 @@ static int predicted_fault(opal_list_t *proc_list, opal_list_t *node_list, opal_list_t *suggested_map); -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - static int suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); @@ -96,7 +81,6 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = { orte_errmgr_base_log, orte_errmgr_base_abort, orte_errmgr_base_abort_peers, - 
update_state, predicted_fault, suggest_map_targets, ft_event, @@ -104,11 +88,32 @@ orte_errmgr_base_module_t orte_errmgr_default_orted_module = { NULL }; +/* Local functions */ +static bool any_live_children(orte_jobid_t job); +static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat); +static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child); +static bool all_children_registered(orte_jobid_t job); +static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); +static void failed_start(orte_job_t *jobdat); +static void update_local_children(orte_job_t *jobdat, + orte_job_state_t jobstate, + orte_proc_state_t state); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); + +static void job_errors(int fd, short args, void *cbdata); +static void proc_errors(int fd, short args, void *cbdata); + /************************ * API Definitions ************************/ static int init(void) { + /* setup state machine to trap job errors */ + orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI); + + /* setup state machine to trap proc errors */ + orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI); + return ORTE_SUCCESS; } @@ -117,138 +122,125 @@ static int finalize(void) return ORTE_SUCCESS; } -static void cbfunc(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) +static void job_errors(int fd, short args, void *cbdata) { - OBJ_RELEASE(buffer); -} - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat = NULL; - orte_odls_child_t *child; - opal_buffer_t *alert; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata; + orte_job_state_t jobstate; + int rc; orte_plm_cmd_flag_t cmd; - int rc=ORTE_SUCCESS; - orte_vpid_t 
null=ORTE_VPID_INVALID; - orte_ns_cmp_bitmask_t mask; + opal_buffer_t *alert; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { - return ORTE_SUCCESS; + return; + } + + /* if the jdata is NULL, then we abort as this + * is reporting an unrecoverable error + */ + if (NULL == caddy->jdata) { + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); + OBJ_RELEASE(caddy); + return; + } + + /* update the state */ + jdata = caddy->jdata; + jobstate = caddy->job_state; + jdata->state = jobstate; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:default_orted: job %s reported error state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jobstate))); + + switch (jobstate) { + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jdata); + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_children(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); + break; + case ORTE_JOB_STATE_COMM_FAILED: + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* order termination */ + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto cleanup; + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* let the HNP handle this */ + goto cleanup; + break; + + default: + break; + } + alert = OBJ_NEW(opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + goto cleanup; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + goto cleanup; + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { + 
ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + } + + cleanup: + OBJ_RELEASE(caddy); +} + +static void proc_errors(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata; + orte_proc_t *pptr; + orte_process_name_t *proc = &caddy->name; + orte_proc_state_t state = caddy->proc_state; + + orte_proc_t *child, *ptr; + opal_buffer_t *alert; + orte_plm_cmd_flag_t cmd; + int rc=ORTE_SUCCESS; + orte_vpid_t null=ORTE_VPID_INVALID; + orte_ns_cmp_bitmask_t mask=ORTE_NS_CMP_ALL; + int i, nchildren; + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default_orted:update_state process %s to %s", + "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* if this is a heartbeat failure, let the HNP handle it */ - if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || - ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { - return ORTE_SUCCESS; - } - - /*** UPDATE COMMAND FOR A JOB ***/ - if (NULL == proc) { - /* this is an update for an entire job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the "invalid" jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = 
ORTE_SUCCESS; - } - return rc; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == job) { - break; - } - } - if (NULL == jobdat) { - return ORTE_ERR_NOT_FOUND; - } - - switch (jobstate) { - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jobdat, exit_code); - break; - case ORTE_JOB_STATE_RUNNING: - /* update all local child states */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); - case ORTE_JOB_STATE_COMM_FAILED: - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* tell the caller we can't recover */ - return ORTE_ERR_UNRECOVERABLE; - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* let the HNP handle this */ - return ORTE_SUCCESS; - break; - - default: - break; - } - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the job info */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { - ORTE_ERROR_LOG(rc); - } - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; + if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { + goto cleanup; } /* if this was a failed comm, then see if it was to our @@ -257,12 +249,12 @@ static int update_state(orte_jobid_t job, if 
(ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; + goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - ignore */ - return ORTE_SUCCESS; + goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted daemon %s exited", @@ -279,79 +271,76 @@ static int update_state(orte_jobid_t job, /* terminate - our routed children will see * us leave and automatically die */ - orte_quit(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto cleanup; } - /* was it a daemon that failed? */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { - /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default:orted all routes gone - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_quit(); - } else { - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default:orted not exiting, num_routes() == %d, num children == %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)orte_routed.num_routes(), - (int)opal_list_get_size(&orte_local_children))); + /* are any of my children still alive */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) { + goto cleanup; + } } } - /* if not, then indicate we can continue */ - return ORTE_SUCCESS; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == proc->jobid) { - break; + /* if all my routes and children are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes()) { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s errmgr:default:orted all routes gone - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + } else { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s errmgr:default:orted not exiting, num_routes() == %d, num_children == %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (int)orte_routed.num_routes(), nchildren)); } + /* if not, then we can continue */ + goto cleanup; } - if (NULL == jobdat) { + + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ - return ORTE_SUCCESS; + goto cleanup; } + pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* if there are no local procs for this job, we can * ignore this call */ - if (0 == jobdat->num_local_procs) { - return ORTE_SUCCESS; + if (0 == jdata->num_local_procs) { + goto cleanup; + } + + /* find this proc in the local children */ + child = NULL; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) { + child = ptr; + break; + } + } + if (NULL == child) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s errmgr:default_orted got state %s for proc %s pid %d", + "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), - ORTE_NAME_PRINT(proc), pid)); + ORTE_NAME_PRINT(proc))); - /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { - /* find this proc in the local children */ - for (item = 
opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - /* kill this proc */ - killprocs(proc->jobid, proc->vpid); - } - return ORTE_SUCCESS; - } - } + child->state = state; + /* Decrement the number of local procs */ + jdata->num_local_procs--; + /* kill this proc */ + killprocs(proc->jobid, proc->vpid); + goto cleanup; } if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { @@ -361,6 +350,21 @@ static int update_state(orte_jobid_t job, } } + if (ORTE_PROC_STATE_FAILED_TO_START == state || + ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { + /* update the proc state */ + child->state = state; + /* count the proc as having "terminated" */ + jdata->num_terminated++; + /* leave the error report in this case to the + * state machine, which will receive notice + * when all local procs have attempted to start + * so that we send a consolidated error report + * back to the HNP + */ + goto cleanup; + } + if (ORTE_PROC_STATE_TERMINATED < state) { /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away @@ -370,79 +374,46 @@ static int update_state(orte_jobid_t job, cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - return rc; + return; } - /* find this proc in the local children */ - for (item = 
opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - } - /* now pack the child's info */ - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* remove the child from our local list as it is no longer alive */ - opal_list_remove_item(&orte_local_children, &child->super); - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), - jobdat->num_local_procs)); - - /* release the child object */ - OBJ_RELEASE(child); - /* done with loop */ - break; - } + child->state = state; + /* now pack the child's info */ + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return; } + /* remove the child from our local array as it is no longer alive */ + opal_pointer_array_set_item(orte_local_children, i, NULL); + /* Decrement the number of local procs */ + jdata->num_local_procs--; + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child->name), + jdata->num_local_procs)); + + /* release the child object */ + OBJ_RELEASE(child); /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; 
} - return rc; + return; } REPORT_STATE: - /* find this proc in the local children so we can update its state */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - } - /* done with loop */ - break; - } - } - if (ORTE_PROC_STATE_REGISTERED == state) { /* see if everyone in this job has registered */ if (all_children_registered(proc->jobid)) { @@ -460,116 +431,97 @@ static int update_state(orte_jobid_t job, cmd = ORTE_PLM_INIT_ROUTES_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } - /* pack all the local child vpids and epochs */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) { + /* pack all the local child vpids */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (ptr->name.jobid == proc->jobid) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } } } /* pack an invalid marker */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + 
return; } /* add in contact info for all procs in the job */ if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&alert); - return rc; + return; } /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; } } - return rc; + return; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return ORTE_SUCCESS; - } - alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; + return; } /* pack the data for the job */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { + if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); + return; } - FINAL_CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); + ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list - do not lock * the thread as we are already locked */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = 
(orte_odls_child_t*)item; - next = opal_list_get_next(item); - - if (jobdat->jobid == child->name->jobid) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (jdata->jobid == ptr->name.jobid) { + opal_pointer_array_set_item(orte_local_children, i, NULL); + OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ - orte_session_dir_cleanup(jobdat->jobid); + orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ - opal_list_remove_item(&orte_local_jobdata, &jobdat->super); - OBJ_RELEASE(jobdat); + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); + OBJ_RELEASE(jdata); /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; } - - /* indicate that the job is complete */ - return rc; + return; } - return ORTE_SUCCESS; + + cleanup: + OBJ_RELEASE(caddy); } static int predicted_fault(opal_list_t *proc_list, @@ -597,18 +549,15 @@ static int ft_event(int state) *****************/ static bool any_live_children(orte_jobid_t job) { - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + int i; + orte_proc_t *child; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* is this child 
part of the specified job? */ - if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && + if ((job == child->name.jobid || ORTE_JOBID_WILDCARD == job) && child->alive) { return true; } @@ -619,12 +568,12 @@ static bool any_live_children(orte_jobid_t job) } -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) +static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child) { int rc; /* pack the child's vpid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } @@ -633,20 +582,6 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) ORTE_ERROR_LOG(rc); return rc; } - /* if we are timing things, pack the time the proc was launched */ - if (orte_timing) { - int64_t tmp; - tmp = child->starttime.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = child->starttime.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } /* pack its state */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) { ORTE_ERROR_LOG(rc); @@ -661,11 +596,10 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) return ORTE_SUCCESS; } -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) +static int pack_state_update(opal_buffer_t *alert, orte_job_t *jobdat) { - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; + int rc, i; + orte_proc_t *child; orte_vpid_t null=ORTE_VPID_INVALID; /* pack the jobid */ @@ -673,27 +607,12 @@ static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) ORTE_ERROR_LOG(rc); return rc; } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if 
(orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); /* if this child is part of the job... */ - if (child->name->jobid == jobdat->jobid) { + if (child->name.jobid == jobdat->jobid) { if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return rc; @@ -711,18 +630,15 @@ static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) static bool all_children_registered(orte_jobid_t job) { - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + int i; + orte_proc_t *child; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* is this child part of the specified job? */ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { + if (job == child->name.jobid || ORTE_JOBID_WILDCARD == job) { /* if this child has terminated, we consider it as having * registered for the purposes of this function. 
If it never * did register, then we will send a NULL rml_uri back to @@ -736,39 +652,31 @@ static bool all_children_registered(orte_jobid_t job) */ continue; } - /* if this child is *not* registered yet, return false */ - if (!child->init_recvd) { - return false; - } - /* if this child has registered a finalize, return false */ - if (child->fini_recvd) { + /* if this child has *not* registered yet, return false */ + if (!child->registered) { return false; } } } - /* if we get here, then everyone in the job is currently registered */ + /* if we get here, then everyone in the job has registered */ return true; } static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) { - opal_list_item_t *item; - orte_odls_child_t *child; - int rc; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + orte_proc_t *child; + int rc, i; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* is this child part of the specified job? 
*/ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { + if (job == child->name.jobid || ORTE_JOBID_WILDCARD == job) { /* pack the child's vpid - must be done in case rml_uri is NULL */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name.vpid), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } @@ -784,21 +692,21 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) } -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) +static void failed_start(orte_job_t *jobdat) { - opal_list_item_t *item; - orte_odls_child_t *child; + int i; + orte_proc_t *child; /* set the state */ jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_FAILED_TO_START == child->state) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + /* is this child part of the specified job? 
*/ + if (child->name.jobid == jobdat->jobid) { + if (ORTE_PROC_STATE_FAILED_TO_START == child->state) { /* this proc never launched - flag that the iof * is complete or else we will hang waiting for * pipes to close that were never opened @@ -816,19 +724,20 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) return; } -static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) +static void update_local_children(orte_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) { - opal_list_item_t *item; - orte_odls_child_t *child; + int i; + orte_proc_t *child; /* update job state */ jobdat->state = jobstate; /* update children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (jobdat->jobid == child->name->jobid) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + /* is this child part of the specified job? */ + if (jobdat->jobid == child->name.jobid) { child->state = state; } } @@ -857,7 +766,6 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/errmgr/errmgr.h b/orte/mca/errmgr/errmgr.h index 24a6de292a..2d7348c6f0 100644 --- a/orte/mca/errmgr/errmgr.h +++ b/orte/mca/errmgr/errmgr.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. 
+ * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -204,27 +206,6 @@ __opal_attribute_format_funcptr__(__printf__, 2, 3); typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs, orte_std_cntr_t num_procs); -/** - * Alert - process aborted - * This function is called by the PLM when a remote process aborts during execution. Actions taken - * in response to the abnormal termination of a remote application process will vary across - * the various errmgr components. - * - * NOTE: Local process errors should always be reported through the error_detected interface and - * NOT here. - * - * @param *name Pointer to the name of the proc that aborted - * - * @retval ORTE_SUCCESS Whatever action that was taken was successful - * @retval ORTE_ERROR Appropriate error code - */ -typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - /** * Predicted process/node failure notification * @@ -294,8 +275,6 @@ struct orte_errmgr_base_module_2_3_0_t { orte_errmgr_base_module_abort_fn_t abort; orte_errmgr_base_module_abort_peers_fn_t abort_peers; - /** Actual process failure notification */ - orte_errmgr_base_module_update_state_fn_t update_state; /** Predicted process/node failure notification */ orte_errmgr_base_module_predicted_fault_fn_t predicted_fault; /** Suggest a node to map a restarting process onto */ diff --git a/orte/mca/errmgr/hnp/Makefile.am b/orte/mca/errmgr/hnp/Makefile.am deleted file mode 100644 index db6b1a6a0f..0000000000 --- a/orte/mca/errmgr/hnp/Makefile.am +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_pkgdata_DATA = help-orte-errmgr-hnp.txt - -sources = \ - errmgr_hnp.h \ - errmgr_hnp_component.c \ - errmgr_hnp.c \ - errmgr_hnp_autor.c \ - errmgr_hnp_crmig.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_errmgr_hnp_DSO -component_noinst = -component_install = mca_errmgr_hnp.la -else -component_noinst = libmca_errmgr_hnp.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_hnp_la_SOURCES = $(sources) -mca_errmgr_hnp_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_hnp_la_SOURCES =$(sources) -libmca_errmgr_hnp_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c deleted file mode 100644 index 5f6e766cd5..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ /dev/null @@ -1,2182 +0,0 @@ -/* - * Copyright (c) 2009-2011 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif -#ifdef HAVE_SYS_WAIT_H -#include -#endif - -#include "opal/util/output.h" -#include "opal/dss/dss.h" - -#include "orte/mca/rml/rml.h" -#include "orte/mca/odls/odls.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/odls/base/odls_private.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/sensor/sensor.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/debugger/base/base.h" -#include "orte/mca/notifier/notifier.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/ess/ess.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" -#include "orte/util/nidmap.h" - -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_locks.h" -#include "orte/runtime/orte_quit.h" -#include "orte/runtime/data_type_support/orte_dt_support.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_hnp.h" - -/********************** - * C/R Mgr Components - * Global: HNP - **********************/ -static orte_errmgr_base_module_t global_module = { - /** Initialization Function */ - orte_errmgr_hnp_global_module_init, - /** Finalization Function */ - orte_errmgr_hnp_global_module_finalize, - /** Error Log */ - orte_errmgr_base_log, - /** Forced Abort */ - orte_errmgr_base_abort, - /** Peer Force Abort */ - orte_errmgr_base_abort_peers, - /** Update State */ - orte_errmgr_hnp_global_update_state, - /* Predicted Fault */ - orte_errmgr_hnp_global_predicted_fault, - /* Suggest proc to node mapping */ - orte_errmgr_hnp_global_suggest_map_targets, - /* FT Event hook */ - 
orte_errmgr_hnp_global_ft_event, - orte_errmgr_base_register_migration_warning -#if ORTE_RESIL_ORTE - /* Set the callback */ - ,orte_errmgr_base_set_fault_callback -#endif -}; - - -/* - * Local functions - */ -static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); -static void failed_start(orte_job_t *jdata); -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code); -static void check_job_complete(orte_job_t *jdata); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, - orte_proc_state_t state, orte_exit_code_t exit_code); -static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); -#if ORTE_RESIL_ORTE -static int send_to_local_applications(opal_pointer_array_t *dead_names); -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata); -#endif - -/************************ - * API Definitions - ************************/ -int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp:component_query()"); - - if( ORTE_PROC_IS_HNP ) { - *priority = mca_errmgr_hnp_component.super.priority; - *module = (mca_base_module_t *)&global_module; - } - /* Daemons and Apps have their own components */ - else { - *module = NULL; - *priority = -1; - } - - return ORTE_SUCCESS; -} - -/******************* - * Global Functions - ********************/ -int orte_errmgr_hnp_global_module_init(void) -{ - int ret, exit_status = ORTE_SUCCESS; - -#if OPAL_ENABLE_FT_CR - if( mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_init()) ) { - exit_status = ret; - goto cleanup; - } - } - else { - /* Still need the tool listener so we can tell it that we cannot do - * anything if they 
ask. - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - if( mca_errmgr_hnp_component.autor_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_init()) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_init()) ) { - exit_status = ret; - goto cleanup; - } - -cleanup: - return exit_status; -} - -int orte_errmgr_hnp_global_module_finalize(void) -{ - int ret, exit_status = ORTE_SUCCESS; - -#if OPAL_ENABLE_FT_CR - if( mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_module_finalize()) ) { - exit_status = ret; - goto cleanup; - } - } - else { - /* Still need the tool listener so we can tell it that we cannot do - * anything if they ask. - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - if( mca_errmgr_hnp_component.autor_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_module_finalize()) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_finalize()) ) { - exit_status = ret; - goto cleanup; - } - -cleanup: - return exit_status; -} - -int orte_errmgr_hnp_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - int ret, exit_status = ORTE_SUCCESS; - - mca_errmgr_hnp_component.ignore_current_update = false; - - if (orte_finalizing || - orte_job_term_ordered || - ORTE_PROC_STATE_TERMINATED == state ) { - mca_errmgr_hnp_component.term_in_progress = true; - } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:hnp:update_state() %s) " - "------- %s state updated for process %s to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc_name) 
? "App. Process" : - (proc_name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state))); - -#if OPAL_ENABLE_FT_CR - if( mca_errmgr_hnp_component.crmig_enabled && - !mca_errmgr_hnp_component.autor_in_progress) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_update_state(job, - jobstate, - proc_name, - state, - pid, - exit_code)) ) { - exit_status = ret; - goto cleanup; - } - } - - if( mca_errmgr_hnp_component.autor_enabled && - !mca_errmgr_hnp_component.crmig_in_progress) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_update_state(job, - jobstate, - proc_name, - state, - pid, - exit_code)) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( !mca_errmgr_hnp_component.ignore_current_update ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_update_state(job, - jobstate, - proc_name, - state, - pid, - exit_code)) ) { - exit_status = ret; - goto cleanup; - } - } - -cleanup: - return exit_status; -} - -int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map) -{ -#if OPAL_ENABLE_FT_CR - int ret, exit_status = ORTE_SUCCESS; - - if( mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_predicted_fault(proc_list, - node_list, - suggested_map)) ) { - exit_status = ret; - goto cleanup; - } - } - /* - * If Process migration is not enabled, then return an error the tool - * which will print an appropriate message for the user. 
- */ - else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp:predicted_fault() Command line asked for a migration, but it is not enabled\n")); - orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERROR); - exit_status = ORTE_ERR_NOT_IMPLEMENTED; - goto cleanup; - } - -cleanup: - return exit_status; -#else - return ORTE_ERR_NOT_IMPLEMENTED; -#endif /* OPAL_ENABLE_FT_CR */ -} - -int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ -#if OPAL_ENABLE_FT_CR - int ret, exit_status = ORTE_ERR_NOT_IMPLEMENTED; - - if( mca_errmgr_hnp_component.crmig_enabled && - !mca_errmgr_hnp_component.autor_in_progress ) { - exit_status = ORTE_SUCCESS; - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_suggest_map_targets(proc, - oldnode, - node_list)) ) { - exit_status = ret; - goto cleanup; - } - } - - if( mca_errmgr_hnp_component.autor_enabled && - !mca_errmgr_hnp_component.crmig_in_progress ) { - exit_status = ORTE_SUCCESS; - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_suggest_map_targets(proc, - oldnode, - node_list)) ) { - exit_status = ret; - goto cleanup; - } - } - -cleanup: - return exit_status; -#else - return ORTE_ERR_NOT_IMPLEMENTED; -#endif /* OPAL_ENABLE_FT_CR */ -} - -int orte_errmgr_hnp_global_ft_event(int state) -{ - int ret, exit_status = ORTE_SUCCESS; - -#if OPAL_ENABLE_FT_CR - if( !mca_errmgr_hnp_component.crmig_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_ft_event(state)) ) { - exit_status = ret; - goto cleanup; - } - } - - if( !mca_errmgr_hnp_component.autor_enabled ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_ft_event(state)) ) { - exit_status = ret; - goto cleanup; - } - } -#endif /* OPAL_ENABLE_FT_CR */ - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_base_global_ft_event(state)) ) { - exit_status = ret; - goto cleanup; - } - -cleanup: - return exit_status; -} - - -/********************** - * 
From HNP - **********************/ -int orte_errmgr_hnp_base_global_init(void) -{ - int ret = ORTE_SUCCESS; - -#if ORTE_RESIL_ORTE - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE, - ORTE_RML_PERSISTENT, failure_notification, NULL); -#endif - - return ret; -} - -int orte_errmgr_hnp_base_global_finalize(void) -{ -#if ORTE_RESIL_ORTE - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); -#endif - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_job_t *jdata; - orte_exit_code_t sts; - orte_odls_child_t *child; - int rc; - orte_app_context_t *app; - orte_proc_t *pdat; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported state %s" - " for proc %s state %s pid %d exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state), pid, exit_code)); - - /* - * if orte is trying to shutdown, just let it - */ - if (orte_finalizing) { - return ORTE_SUCCESS; - } - - if (NULL == proc) { - /* this is an update for an entire local job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { - orte_never_launched = true; - } - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - hnp_abort(job, exit_code); - return ORTE_SUCCESS; - } - - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - /* update the state */ - jdata->state = jobstate; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported state %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), - orte_job_state_to_str(jobstate))); - - switch (jobstate) { - case ORTE_JOB_STATE_TERMINATED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_ABORTED: - /* support batch-operated jobs */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); - break; - - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jdata); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - sts = exit_code; - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - if (WIFSIGNALED(exit_code)) { /* died on signal */ -#ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } else { - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); - } -#else - orte_show_help("help-plm-base.txt", "daemon-died-signal", true, - WTERMSIG(exit_code)); - sts = WTERMSIG(exit_code); -#endif /* WCOREDUMP */ - } else { - orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, - WEXITSTATUS(exit_code)); - sts = WEXITSTATUS(exit_code); - } - } - hnp_abort(jdata->jobid, sts); - } - break; - - case ORTE_JOB_STATE_SILENT_ABORT: - failed_start(jdata); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { - /* set the flag indicating that a daemon failed so we use the proper - * methods for attempting to shutdown the rest of the system - */ - orte_abnormal_term_ordered = true; - } - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_JOB_STATE_RUNNING: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); - /* record that we reported */ - jdata->num_daemons_reported++; - /* report if requested */ - if (orte_report_launch_progress) { - if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { - opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", - (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, - (int)jdata->num_launched, (int)jdata->num_procs); - } - } - break; - case ORTE_JOB_STATE_NEVER_LAUNCHED: - orte_never_launched = true; - jdata->num_terminated = jdata->num_procs; - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, - ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - exit_code); - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_COMM_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD); - check_job_complete(jdata); /* set the local proc states */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - default: - break; - } - return ORTE_SUCCESS; - } - - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { - /* if the orteds are terminating, check job complete */ - if (orte_orteds_term_ordered) { - opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); - check_job_complete(NULL); - return ORTE_SUCCESS; - } else { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - } - -#if OPAL_ENABLE_FT_CR - /* Notify the process state to the notifier framework if it is - active and selected. 
*/ - orte_errmgr_base_proc_state_notify(state, proc); -#endif - - /* update is for a specific proc */ - switch (state) { - case ORTE_PROC_STATE_ABORTED: - case ORTE_PROC_STATE_ABORTED_BY_SIG: - case ORTE_PROC_STATE_TERM_WO_SYNC: - if( jdata->enable_recovery ) { - /* is this a local proc */ - if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_restarts) { - child->restarts++; - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - } else { - /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* guess not - let it fall thru to abort */ - } - } - - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_FAILED_TO_START: - case ORTE_PROC_STATE_CALLED_ABORT: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_REGISTERED: - case ORTE_PROC_STATE_RUNNING: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_LAUNCHED: - /* record the pid for this child */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_TERMINATED: - case ORTE_PROC_STATE_TERM_NON_ZERO: - case ORTE_PROC_STATE_KILLED_BY_CMD: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - break; - - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (jdata->enable_recovery) { - killprocs(proc->jobid, proc->vpid); - /* is this a local proc */ - if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_restarts) { - child->restarts++; - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - } else { - /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* guess not - let it fall thru to abort */ - } - } - /* kill all jobs */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. 
If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_COMM_FAILED: - /* is this to a daemon? */ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* if this is my own connection, ignore it */ - if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s My own connection - ignoring it", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - break; - } - /* if we have ordered orteds to terminate, record it */ - if (orte_orteds_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Daemons terminating - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - check_job_complete(jdata); - break; - } - /* if abort is in progress, see if this one failed to tell - * us it had terminated - */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Abort in progress - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - check_job_complete(jdata); - break; - } - - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* delete the route */ - orte_routed.delete_route(proc); - /* purge the oob */ - orte_rml.purge(proc); - - if( orte_enable_recovery ) { - /* relocate its 
processes */ - if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { - /* unable to relocate for some reason */ - opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* kill all jobs */ - hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - /* check if all is complete so we can terminate */ - check_job_complete(jdata); - } - } else { -#if !ORTE_RESIL_ORTE - if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, - ORTE_VPID_PRINT(proc->vpid), "Unknown"); - } else { - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, - ORTE_VPID_PRINT(proc->vpid), - (NULL == pdat->node) ? "Unknown" : - ((NULL == pdat->node->name) ? "Unknown" : pdat->node->name)); - } -#endif - if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) { - /* The process is already dead so don't keep trying to do - * this stuff. 
*/ - return ORTE_SUCCESS; - } - -#if !ORTE_RESIL_ORTE - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* kill all jobs */ - hnp_abort(ORTE_JOBID_WILDCARD, exit_code); -#endif - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process */ - check_job_complete(jdata); - } - } - break; - - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - /* heartbeats are only from daemons */ - if( orte_enable_recovery ) { - /* relocate its processes */ - } else { - orte_errmgr_hnp_record_dead_process(proc); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* kill all jobs */ - hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - return ORTE_ERR_UNRECOVERABLE; - } - break; - - default: - break; - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_base_global_ft_event(int state) -{ - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - opal_pointer_array_t *dead_names; - int32_t i; - orte_process_name_t *name_item; - orte_epoch_t epoch; - orte_job_t *jdat; - orte_proc_t *pdat, *pdat2; - opal_buffer_t *answer; - - /* If processes have started terminating, don't worry about reported - * failures. The ORTEDs don't know the difference. 
*/ - if (mca_errmgr_hnp_component.term_in_progress) { - return; - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender)); - } - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return; - } - - dead_names = OBJ_NEW(opal_pointer_array_t); - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - /* Unpack the buffer to get the dead process' name. */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return; - } - - /* Check to see if the message is telling us about an old epoch. - * If so ignore the message. - */ - epoch = orte_util_lookup_epoch(name_item); - if (name_item->epoch < epoch) { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - free(name_item); - continue; - } else { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - } - - opal_pointer_array_add(dead_names, name_item); - - /* Check to see if the message is telling us about an orted and - * it is from another orted. Orteds don't have the list of all - * the application processes so they don't know if there were - * any child processes on the nodes that they are reporting. 
*/ - if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - continue; - } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { - continue; - } else if (NULL == pdat->node) { - continue; - } - - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } - - /* ignore this process if it has already terminated */ - if (ORTE_PROC_STATE_TERMINATED <= pdat2->state) { - continue; - } - - /* the proc must have been alive, so notify everyone that it died */ - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - name_item->jobid = pdat2->name.jobid; - name_item->vpid = pdat2->name.vpid; - name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); - - opal_pointer_array_add(dead_names, name_item); - } - } - } - - } - - /* Update the number of failed process so any duplicates don't get - * re-reported. - */ - num_failed = opal_pointer_array_get_size(dead_names); - - if (num_failed > 0) { - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); - - if (!orte_orteds_term_ordered) { - /* Send a message out to all the orteds to inform them that the - * process is dead. Long live the process (or not if it is so - * decided)! 
- */ - answer = OBJ_NEW(opal_buffer_t); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return; - } - - for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_FAILURE_NOTICE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return; - } - - /* Tell the applications' ORTE layers that there is a failure. */ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return; - } - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } - } - - OBJ_RELEASE(dead_names); -} -#endif - -/***************** - * Local Functions - *****************/ -static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) -{ - int rc; - - /* if we are already in progress, then ignore this call */ - if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: abort called on job %s with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - - /* if debuggers are running, clean up */ - orte_debugger.finalize(); - - /* set control params to indicate we are terminating */ - orte_job_term_ordered = true; - orte_abnormal_term_ordered = true; - orte_enable_recovery = false; - - /* set the exit status, just in 
case whomever called us failed - * to do so - it can only be done once, so we are protected - * from overwriting it - */ - ORTE_UPDATE_EXIT_STATUS(exit_code); - - /* tell the plm to terminate the orteds - they will automatically - * kill their local procs - */ - if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { - ORTE_ERROR_LOG(rc); - } -} - -static void failed_start(orte_job_t *jdata) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_UNTERMINATED < child->state) { - /* get the master proc object */ - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = child->state; - proc->exit_code = child->exit_code; - /* update the counter so we can terminate */ - jdata->num_terminated++; - /* remove the child from our list */ - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jobdat->num_local_procs--; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported incomplete 
start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); -} - -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, - orte_proc_state_t state, orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat; - orte_odls_child_t *child; - orte_proc_t *proc; - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == jdata->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return; - } - jobdat->state = jobstate; - jdata->state = jobstate; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (jdata->jobid == child->name->jobid) { - child->state = state; - proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proc->state = state; - if (proc->exit_code < exit_code) { - proc->exit_code = exit_code; - } - if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - jdata->num_terminated++; - jobdat->num_local_procs--; - } else if (ORTE_PROC_STATE_RUNNING) { - jdata->num_launched++; - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - -} - -void orte_errmgr_hnp_update_proc(orte_job_t *jdata, - 
orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_proc_t *proct; - orte_odls_job_t *jobdat, *jdat; - int i; - - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == jdata->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - } - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /*** UPDATE LOCAL CHILD ***/ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - next = opal_list_get_next(item); - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (child->name->vpid == proc->vpid) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_UNTERMINATED < state) { - if (!jdata->enable_recovery) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - if (NULL != jobdat) { - jobdat->num_local_procs--; - } - } - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } else if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } - return; - } - } - } - - opal_condition_signal(&orte_odls_globals.cond); - 
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - /*** UPDATE REMOTE CHILD ***/ - for (i=0; i < jdata->procs->size; i++) { - if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - if (proct->name.jobid != proc->jobid || - proct->name.vpid != proc->vpid) { - continue; - } - proct->state = state; - if (0 < pid) { - proct->pid = pid; - } - proct->exit_code = exit_code; - if (ORTE_PROC_STATE_REGISTERED == state) { - jdata->num_reported++; - if (jdata->dyn_spawn_active && - jdata->num_reported == jdata->num_procs) { - OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - } - } else if (ORTE_PROC_STATE_UNTERMINATED < state) { - /* update the counter so we can terminate */ - jdata->num_terminated++; - } else if (ORTE_PROC_STATE_RUNNING == state) { - jdata->num_launched++; - if (jdata->num_launched == jdata->num_procs) { - jdata->state = ORTE_JOB_STATE_RUNNING; - } - } - return; - } -} - -static void check_job_complete(orte_job_t *jdata) -{ - orte_proc_t *proc; - int i; - orte_std_cntr_t j; - orte_job_t *job; - orte_node_t *node; - orte_job_map_t *map; - orte_std_cntr_t index; - bool one_still_alive; - orte_vpid_t non_zero=0, lowest=0; - char *msg; - -#if 0 - /* Check if FileM is active. If so then keep processing. 
*/ - OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); -#endif - if (NULL == jdata) { - /* just check to see if the daemons are complete */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_complete - received NULL job, checking daemons", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto CHECK_DAEMONS; - } - - for (i=0; i < jdata->procs->size && !jdata->abort; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - /* the proc array may no longer be left justified, so - * we need to check everything - */ - continue; - } - - if (0 != proc->exit_code) { - non_zero++; - if (0 == lowest) { - lowest = proc->exit_code; - } - } - - switch (proc->state) { - case ORTE_PROC_STATE_KILLED_BY_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s killed by cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - /* we ordered this proc to die, so it isn't an abnormal termination - * and we don't flag it as such - just check the remaining jobs to - * see if anyone is still alive - */ - if (jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated - now we need to check to see if ALL - * the other jobs have also completed and wakeup if that is true - */ - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; - } - } - goto CHECK_ALIVE; - break; - case ORTE_PROC_STATE_ABORTED: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case 
ORTE_PROC_STATE_FAILED_TO_START: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr_hnp:check_job_completed proc %s failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_FAILED_TO_START; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_ABORTED_BY_SIG: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted by signal", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_WO_SYNC: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated without sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - /* now treat a special case - if the proc exit'd without a required - * sync, it may have done so with a zero exit code. 
We want to ensure - * that the user realizes there was an error, so in this -one- case, - * we overwrite the process' exit code with the default error code - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - } - break; - case ORTE_PROC_STATE_COMM_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_COMM_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_CALLED_ABORT: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_CALLED_ABORT; - /* point to the first proc to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - break; - case ORTE_PROC_STATE_TERM_NON_ZERO: - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - if (orte_abort_non_zero_exit) { - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - } - } - break; 
- - default: - if (ORTE_PROC_STATE_UNTERMINATED < proc->state && - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - proc->state = ORTE_PROC_STATE_ABORTED; - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - } - } - break; - } - } - - if (jdata->abort) { - /* the job aborted - turn off any sensors on this job */ - orte_sensor.stop(jdata->jobid); - } - - if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && - jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated */ - jdata->state = ORTE_JOB_STATE_TERMINATED; - - /* turn off any sensor monitors on this job */ - orte_sensor.stop(jdata->jobid); - - if (0 < non_zero) { - if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { - /* update the exit code */ - ORTE_UPDATE_EXIT_STATUS(lowest); - } - - /* warn user */ - opal_output(orte_clean_output, - "-------------------------------------------------------\n" - "While %s job %s terminated normally, %s %s. Further examination may be required.\n" - "-------------------------------------------------------", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", - (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), - ORTE_VPID_PRINT(non_zero), - (1 == non_zero) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - } - - /* if this job is a continuously operating one, then don't do - * anything further - just return here - */ - if (NULL != jdata && - (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || - ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { - goto CHECK_ALIVE; - } - - /* if the job that is being checked is the HNP, then we are - * trying to terminate the orteds. In that situation, we - * do -not- check all jobs - we simply notify the HNP - * that the orteds are complete. Also check special case - * if jdata is NULL - we want - * to definitely declare the job done if the orteds - * have completed, no matter what else may be happening. - * This can happen if a ctrl-c hits in the "wrong" place - * while launching - */ -CHECK_DAEMONS: - if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - if (0 == orte_routed.num_routes()) { - /* orteds are done! */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (NULL == jdata) { - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - } - jdata->state = ORTE_JOB_STATE_TERMINATED; - orte_quit(); - return; - } - return; - } - - /* Release the resources used by this job. Since some errmgrs may want - * to continue using resources allocated to the job as part of their - * fault recovery procedure, we only do this once the job is "complete". - * Note that an aborted/killed job -is- flagged as complete and will - * therefore have its resources released. 
We need to do this after - * we call the errmgr so that any attempt to restart the job will - * avoid doing so in the exact same place as the current job - */ - if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { - map = jdata->map; - for (index = 0; index < map->nodes->size; index++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { - continue; - } - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing procs from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); - for (i = 0; i < node->procs->size; i++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (proc->name.jobid != jdata->jobid) { - /* skip procs from another job */ - continue; - } - node->slots_inuse--; - node->num_procs--; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s releasing proc %s from node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), node->name)); - /* set the entry in the node array to NULL */ - opal_pointer_array_set_item(node->procs, i, NULL); - /* release the proc once for the map entry */ - OBJ_RELEASE(proc); - } - } - OBJ_RELEASE(map); - jdata->map = NULL; - } - -CHECK_ALIVE: - /* now check to see if all jobs are done - release this jdata - * object when we find it - */ - one_still_alive = false; - for (j=1; j < orte_job_data->size; j++) { - if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { - /* since we are releasing jdata objects as we - * go, we can no longer assume that the job_data - * array is left justified - */ - continue; - } - /* if this is the job we are checking AND it normally terminated, - * then go ahead and release it. 
We cannot release it if it - * abnormally terminated as mpirun needs the info so it can - * report appropriately to the user - * - * NOTE: do not release the primary job (j=1) so we - * can pretty-print completion message - */ - if (NULL != jdata && job->jobid == jdata->jobid && - (jdata->state == ORTE_JOB_STATE_TERMINATED || - jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { - /* release this object, ensuring that the - * pointer array internal accounting - * is maintained! - */ - if (1 < j) { - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ - OBJ_RELEASE(jdata); - } - continue; - } - /* if the job is flagged to not be monitored, skip it */ - if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { - continue; - } - /* when checking for job termination, we must be sure to NOT check - * our own job as it - rather obviously - has NOT terminated! - */ - if (job->num_terminated < job->num_procs) { - /* we have at least one job that is not done yet - we cannot - * just return, though, as we need to ensure we cleanout the - * job data for the job that just completed - */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs)); - one_still_alive = true; - } - else { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [%s])", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job->jobid), - job->num_terminated, job->num_procs, - (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); - } - } - /* if a job is still alive, we just return */ - if (one_still_alive) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed at least one job is not terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - /* if we get here, then all jobs are done, so terminate */ - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed all jobs terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* set the exit status to 0 - this will only happen if it - * wasn't already set by an error condition - */ - ORTE_UPDATE_EXIT_STATUS(0); - /* provide a notifier message if that framework is active - ignored otherwise */ - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { - if (NULL == job->name) { - job->name = strdup(orte_process_info.nodename); - } - if (NULL == job->instance) { - asprintf(&job->instance, "%d", orte_process_info.pid); - } - if (0 == orte_exit_status) { - asprintf(&msg, "Job %s:%s complete", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); - } else { - asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); - orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); - } - free(msg); - /* this job object will be release during finalize */ - } - - orte_jobs_complete(); - /* if I am the only daemon alive, then I can exit now */ - if (0 == orte_routed.num_routes()) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s orteds complete - exiting", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_quit(); - } -} - -static void killprocs(orte_jobid_t job, orte_vpid_t vpid) -{ - opal_pointer_array_t cmd; - orte_proc_t proc; - int rc; - - /* stop local sensors for this job */ - if (ORTE_VPID_WILDCARD == vpid) { - orte_sensor.stop(job); - } - - if (ORTE_JOBID_WILDCARD == job - && ORTE_VPID_WILDCARD == vpid) { - - if (ORTE_SUCCESS != (rc = 
orte_odls.kill_local_procs(NULL))) { - ORTE_ERROR_LOG(rc); - } - return; - } - - OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); - OBJ_CONSTRUCT(&proc, orte_proc_t); - proc.name.jobid = job; - proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); - opal_pointer_array_add(&cmd, &proc); - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&cmd); - OBJ_DESTRUCT(&proc); -} - -static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, - orte_proc_state_t state, orte_exit_code_t exit_code) -{ - orte_job_t *jdat; - orte_proc_t *pdata, *pdt, *pdt2; - orte_node_t *node, *nd; - orte_app_context_t *app; - char *app_name; - int rc, i, n; - - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s CHECKING ON RELOCATE FOR APP %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* get the proc_t object for this process */ - pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); - if (NULL == pdata) { - opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); - return ORTE_ERR_NOT_FOUND; - } - - /* set the state */ - pdata->state = state; - - /* retain the node id */ - node = pdata->node; - - /* if it is a daemon that died, we need to flag all of its procs - * to be relocated - */ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* remove this proc from the daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* check to see if any other nodes are "alive" */ - if (!orte_hnp_is_allocated && jdata->num_procs == 1) { - return ORTE_ERR_FATAL; - } - app_name = "orted"; - /* scan the procs looking for each unique jobid on the node */ - for (i=0; i < node->procs->size; i++) { - if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - /* get the job data object for this process */ - if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) { - /* major problem */ - 
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - continue; - } - /* since the node was used in this job's map, release - * it so that accounting is maintained - */ - OBJ_RELEASE(node); - /* mark this proc as dead so it will be restarted */ - pdt->state = ORTE_PROC_STATE_ABORTED; - /* remove this proc from the node */ - OBJ_RELEASE(pdt); /* maintains accounting */ - opal_pointer_array_set_item(node->procs, i, NULL); - /* maintain accounting on num procs alive in case this can't restart */ - jdat->num_terminated++; - /* look for all other procs on this node from the same job */ - for (n=0; n < node->procs->size; n++) { - if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { - continue; - } - if (pdt2->name.jobid == pdt->name.jobid) { - /* mark this proc as having aborted */ - pdt2->state = ORTE_PROC_STATE_ABORTED; - /* remove it from the node */ - OBJ_RELEASE(pdt2); - opal_pointer_array_set_item(node->procs, n, NULL); - /* maintain accounting on num procs alive */ - jdat->num_terminated++; - } - } - /* and remove the node from the map */ - for (n=0; n < jdat->map->nodes->size; n++) { - if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) { - continue; - } - if (nd->index == node->index) { - opal_pointer_array_set_item(jdat->map->nodes, n, NULL); - OBJ_RELEASE(node); /* maintain accounting */ - break; - } - } - /* reset the job params for this job */ - orte_plm_base_reset_job(jdat); - - /* relaunch the job */ - opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name); - if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) { - opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); - return rc; - } - } - - return ORTE_SUCCESS; - } - - /* otherwise, we are an app - try to relocate us to another node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); - if (NULL == app) { - /* 
no way to restart this job */ - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true, - ORTE_NAME_PRINT(proc)); - return ORTE_ERR_NOT_FOUND; - } - app_name = app->app; - /* track that we are attempting to restart */ - pdata->restarts++; - /* have we exceeded the number of restarts for this proc? */ - if (app->max_restarts < pdata->restarts) { - return ORTE_ERR_RESTART_LIMIT_EXCEEDED; - } - - /* reset the job params for restart */ - orte_plm_base_reset_job(jdata); - - /* flag the current node as not-to-be-used */ - pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; - - /* restart the job - the spawn function will remap and - * launch the replacement proc(s) - */ - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s RELOCATING APP %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { - opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); - return rc; - } - - return ORTE_SUCCESS; -} - -static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) -{ - orte_odls_child_t *child; - opal_list_item_t *item; - - child = NULL; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid && - child->name->vpid == proc->vpid) { - return child; - } - } - return NULL; -} - -#if ORTE_RESIL_ORTE -static void cbfunc(int status, - orte_process_name_t *peer, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void* cbdata) { - OBJ_RELEASE(buffer); -} -#endif - -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) { - orte_job_t *jdat; - orte_proc_t *pdat, *proc_item; - int i; - opal_pointer_array_t *dead_names; - - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s RECORDING DEAD PROCESS %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - if (NULL == (jdat 
= orte_get_job_data_object(proc->jobid))) { - opal_output(0, "Can't find job object"); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && - ORTE_PROC_STATE_TERMINATED > pdat->state) { - -#if ORTE_ENABLE_EPOCH - /* Make sure that the epochs match. */ - if (proc->epoch != pdat->name.epoch) { - opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); - return ORTE_SUCCESS; - } -#endif - - dead_names = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - opal_pointer_array_add(dead_names, &(pdat->name)); - - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } - - opal_pointer_array_add(dead_names, &(proc_item->name)); - } - } - -#if ORTE_RESIL_ORTE - if (!mca_errmgr_hnp_component.term_in_progress) { - /* - * Send a message to the other daemons so they know that a daemon has - * died. - */ - int rc, num_failed = opal_pointer_array_get_size(dead_names); - opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t); - orte_process_name_t *proc_name; - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } else { - - /* Iterate over the list of dead procs and send them along with - * the rest. The HNP needs this info so it can tell the other - * ORTEDs and they can inform the appropriate applications. 
- */ - for (i = 0; i < num_failed; i++) { - if (NULL != (proc_name = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc_name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } - } - } - - OBJ_RELEASE(dead_names); - - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s SENDING DEAD PROCESS MESSAGE TO HNP", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_FAILURE_NOTICE, 0, cbfunc, NULL); - } - } else { - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); - } -#else - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); -#endif - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - orte_job_t *jdat; - orte_proc_t *pdat; - orte_node_t *node; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking procs as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* Iterate over the list of processes */ - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(1, "NULL found in dead process list."); - continue; - } - - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s Job data not found.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && - pdat->state < ORTE_PROC_STATE_TERMINATED) { - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pdat->name))); - -#if ORTE_RESIL_ORTE - /* Make sure the epochs match, if not it probably means that we - * already reported this failure. 
*/ - if (name_item->epoch != pdat->name.epoch) { - continue; - } - - orte_util_set_epoch(name_item, name_item->epoch + 1); -#endif - - /* Remove it from the job array */ - opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); - orte_process_info.num_procs--; - jdat->num_procs--; - - /* Check if this is an ORTED */ - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - /* Mark the node as down so it won't be used in mapping anymore. */ - node = pdat->node; - node->state = ORTE_NODE_STATE_DOWN; - node->daemon = NULL; - } - - OBJ_RELEASE(pdat); - -#if ORTE_RESIL_ORTE - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; -#endif - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - } else { -#if ORTE_RESIL_ORTE - opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; -#endif - } - - check_job_complete(jdat); - } - -#if ORTE_RESIL_ORTE - if (!mca_errmgr_hnp_component.term_in_progress) { - /* Need to update the orted routing module. 
*/ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if (NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); - } - } -#endif - - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret = ORTE_SUCCESS; - orte_process_name_t *name_item; - int size, i; - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending failure to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ret; -} -#endif - diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.h b/orte/mca/errmgr/hnp/errmgr_hnp.h deleted file mode 100644 index cd20532141..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#ifndef MCA_ERRMGR_hnp_EXPORT_H -#define MCA_ERRMGR_hnp_EXPORT_H - -#include "orte_config.h" - -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Local Component structures - */ -struct orte_errmgr_hnp_component_t { - orte_errmgr_base_component_t super; /** Base Errmgr component */ - - bool ignore_current_update; - bool term_in_progress; - -#if OPAL_ENABLE_FT_CR - /* State of the Recovery */ - bool crmig_in_progress; - bool autor_in_progress; - - /* CRMig Options */ - bool crmig_enabled; - bool crmig_timing_enabled; - - /* AutoR Options */ - bool autor_enabled; - bool autor_timing_enabled; - int autor_recovery_delay; - bool autor_skip_oldnode; -#endif -}; -typedef struct orte_errmgr_hnp_component_t orte_errmgr_hnp_component_t; -ORTE_MODULE_DECLSPEC extern orte_errmgr_hnp_component_t mca_errmgr_hnp_component; - -int orte_errmgr_hnp_component_query(mca_base_module_t **module, int *priority); - -void orte_errmgr_hnp_update_proc(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - -/*************************** - * Module functions: Global - ***************************/ -int orte_errmgr_hnp_global_module_init(void); -int orte_errmgr_hnp_global_module_finalize(void); - -int orte_errmgr_hnp_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map); -int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); -int orte_errmgr_hnp_global_ft_event(int state); -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs); -int 
orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc); - -/* hnp Versions */ -int orte_errmgr_hnp_base_global_init(void); -int orte_errmgr_hnp_base_global_finalize(void); -int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_base_global_ft_event(int state); - -#if OPAL_ENABLE_FT_CR -/* CRMig Versions */ -int orte_errmgr_hnp_crmig_global_module_init(void); -int orte_errmgr_hnp_crmig_global_module_finalize(void); - -int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map); -int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); -int orte_errmgr_hnp_crmig_global_ft_event(int state); - -/* AutoR Versions */ -int orte_errmgr_hnp_autor_global_module_init(void); -int orte_errmgr_hnp_autor_global_module_finalize(void); - -int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); -int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); -int orte_errmgr_hnp_autor_global_ft_event(int state); -#endif - -END_C_DECLS - -#endif /* MCA_ERRMGR_hnp_EXPORT_H */ diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c deleted file mode 100644 index 4ba13ac35f..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c +++ /dev/null @@ -1,1033 +0,0 @@ 
-/* - * Copyright (c) 2009-2011 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/show_help.h" -#include "opal/util/output.h" -#include "opal/util/opal_environ.h" -#include "opal/util/basename.h" -#include "opal/util/argv.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" -#include "opal/mca/event/event.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/runtime/orte_globals.h" -#include "opal/dss/dss.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/iof/iof.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/filem/filem.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" -#include "orte/mca/sstore/sstore.h" -#include "orte/mca/sstore/base/base.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_hnp.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - -#if OPAL_ENABLE_FT_CR -/************************ - * Work Pool structures - ************************/ -struct errmgr_autor_wp_item_t { - /** List super object */ - 
opal_list_item_t super; - - /** ORTE Process name */ - orte_process_name_t name; - - /** State that was passed with it */ - orte_proc_state_t state; -}; -typedef struct errmgr_autor_wp_item_t errmgr_autor_wp_item_t; - -OBJ_CLASS_DECLARATION(errmgr_autor_wp_item_t); - -void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp); -void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp); - -OBJ_CLASS_INSTANCE(errmgr_autor_wp_item_t, - opal_list_item_t, - errmgr_autor_wp_item_construct, - errmgr_autor_wp_item_destruct); - -/************************************ - * Locally Global vars & functions :) - ************************************/ -static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; -static orte_job_t *current_global_jobdata = NULL; - -static bool autor_mask_faults = false; - -static opal_list_t *procs_pending_recovery = NULL; -static bool autor_timer_active = false; -static opal_event_t *autor_timer_event = NULL; - -static void errmgr_autor_recover_processes(int fd, short event, void *cbdata); -static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name); - -static int display_procs(void ); -static int autor_procs_sort_compare_fn(opal_list_item_t **a, - opal_list_item_t **b); - -static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state); -static void errmgr_autor_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); -static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); - -static int check_if_terminated(opal_pointer_array_t *procs); -static int check_if_restarted(opal_pointer_array_t *procs); - -/* - * Timer stuff - */ -static void errmgr_autor_set_time(int idx); -static void errmgr_autor_display_all_timers(void); -static void errmgr_autor_clear_timers(void); - -static double errmgr_autor_get_time(void); -static void 
errmgr_autor_display_indv_timer_core(double diff, char *str); -static double timer_start[OPAL_CR_TIMER_MAX]; - -#define ERRMGR_AUTOR_TIMER_START 0 -#define ERRMGR_AUTOR_TIMER_SETUP 1 -#define ERRMGR_AUTOR_TIMER_TERM 2 -#define ERRMGR_AUTOR_TIMER_RESETUP 3 -#define ERRMGR_AUTOR_TIMER_RESTART 4 -#define ERRMGR_AUTOR_TIMER_FINISH 5 -#define ERRMGR_AUTOR_TIMER_MAX 6 - -#define ERRMGR_AUTOR_CLEAR_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \ - errmgr_autor_clear_timers(); \ - } \ - } - -#define ERRMGR_AUTOR_SET_TIMER(idx) \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \ - errmgr_autor_set_time(idx); \ - } \ - } - -#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.autor_timing_enabled > 0)) { \ - errmgr_autor_display_all_timers(); \ - } \ - } - -/************************ - * Function Definitions: Global - ************************/ -int orte_errmgr_hnp_autor_global_module_init(void) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):init()"); - - procs_pending_recovery = OBJ_NEW(opal_list_t); - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - if( NULL == autor_timer_event ) { - autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL); - } - - ERRMGR_AUTOR_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_module_finalize(void) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):finalize()"); - - if( NULL != procs_pending_recovery ) { - OBJ_RELEASE(procs_pending_recovery); - procs_pending_recovery = NULL; - } - if( NULL != autor_timer_event ) { - free(autor_timer_event); - autor_timer_event = NULL; - } - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - ERRMGR_AUTOR_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -static int 
autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name) -{ - orte_job_t *jdata = NULL; - int i; - - /* - * If we already figured it out, then just move ahead - */ - if( NULL != current_global_jobdata ) { - if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid && - given_jdata->jobid != current_global_jobdata->jobid ) { - current_global_jobdata = given_jdata; - current_global_jobid = given_jdata->jobid; - } - return ORTE_SUCCESS; - } - - /* - * If this references the application, and not the daemons - */ - if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid ) { - current_global_jobdata = given_jdata; - current_global_jobid = given_jdata->jobid; - return ORTE_SUCCESS; - } - - /* - * Otherwise iterate through the job structure and find the first job. - */ - for(i = 0; i < orte_job_data->size; ++i ) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } - /* Exclude outselves */ - if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) { - continue; - } - current_global_jobdata = jdata; - current_global_jobid = jdata->jobid; - break; - } - - if( NULL == current_global_jobdata ) { - opal_output(0, "errmgr:hnp(autor):process_fault(): Global) Error: Cannot find the jdata for the current job."); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_proc_t *loc_proc = NULL; - orte_job_t *jdata = NULL; - int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS; - int32_t i; - - /* - * if orte is trying to shutdown, just let it - */ - if( mca_errmgr_hnp_component.term_in_progress ) { - return ORTE_SUCCESS; - } - - if( NULL != proc_name && - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(autor): 
Update reported on self (%s), state %s. Skip...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state) )); - return ORTE_SUCCESS; - } - - /* - * Get the job data object for this process - */ - if( NULL != proc_name ) { /* Get job from proc's jobid */ - jdata = orte_get_job_data_object(proc_name->jobid); - } else { /* Get from the general job */ - jdata = orte_get_job_data_object(job); - } - if( NULL == jdata ) { - opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); - ret = ORTE_ERROR; - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * If this is a tool, ignore - */ - if( jdata->num_apps == 0 && - OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(autor): An external tool disconnected. Ignore...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(autor): job %s reported state %s" - " for proc %s state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state), exit_code)); - - if( ORTE_JOB_STATE_RESTART == jobstate ) { - for(i = 0; i < jdata->procs->size; ++i) { - if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - break; - } - - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || - ORTE_PROC_STATE_COMM_FAILED == state ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { - if( autor_mask_faults ) { - mca_errmgr_hnp_component.ignore_current_update = true; - orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code); - } - } - - cleanup: - return ret; -} - -static int orte_errmgr_hnp_autor_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state) -{ - int ret; - - /* - * Recover from the process failure by relaunching. - */ - if( ORTE_SUCCESS != (ret = autor_set_current_job_info(jdata, proc_name)) ) { - ORTE_ERROR_LOG(ret); - return ORTE_SUCCESS; /* JJH: Do this for now. 
Need to fix the flag for normal shutdown */ - /*return ret;*/ - } - - current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; - - if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { - errmgr_autor_process_fault_daemon(jdata, proc_name, state); - } else { - orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, 0); - errmgr_autor_process_fault_app(jdata, proc_name, state); - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ - opal_list_item_t *item = NULL; - errmgr_autor_wp_item_t *wp_item = NULL; - orte_node_t *node = NULL; - bool found = false; - int num_removed = 0, num_to_remove; - orte_ns_cmp_bitmask_t mask; - - if( NULL == current_global_jobdata ) { - return ORTE_SUCCESS; - } - - /* JJH Nasty Hack */ - num_to_remove = current_global_jobdata->num_procs / 2; - num_to_remove += 1; - - /* - * Find this process in the known failures list - */ - found = false; - if( mca_errmgr_hnp_component.autor_skip_oldnode ) { - for(item = opal_list_get_first(procs_pending_recovery); - item != opal_list_get_end(procs_pending_recovery); - item = opal_list_get_next(item) ) { - wp_item = (errmgr_autor_wp_item_t*)item; - - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) { - found = true; - break; - } - } - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): suggest_map() " - "Process remapping: %s oldnode %s, %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), - oldnode->name, - (found ? "Failed Proc." 
: "Good Proc.") )); - - /* - * If not a failed process, then return it to the oldnode - * If failed process, do not place it back on the same node - */ - num_removed = 0; - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - if( found ) { - if( num_removed >= num_to_remove ) { - break; - } - /* JJH Nasty Hack */ -#if 0 - /* Remove oldnode (if more than one node) */ - if( node == oldnode && 1 < opal_list_get_size(node_list) ) { - opal_output(0, "JJH Remove Node (%s)", node->name); - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - } -#else - if( 1 < opal_list_get_size(node_list) ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - } -#endif - num_removed++; - } else { - /* Stay on same node */ - if( node != oldnode ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - } - } - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_autor_global_ft_event(int state) -{ - return ORTE_SUCCESS; -} - - -/***************** - * Local Functions - *****************/ -static void errmgr_autor_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - errmgr_autor_wp_item_t *wp_item = NULL; - struct timeval soon; - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): process_fault() " - "Process fault! proc %s (0x%x)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - state)); - - if( !orte_sstore_base_is_checkpoint_available ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): process_fault() " - "No checkpoints are available for this job! 
Cannot Automaticly Recover!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); - opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true, - ORTE_NAME_PRINT(proc), proc->vpid); - return; - } - - mca_errmgr_hnp_component.ignore_current_update = true; - - /* - * If we are already in the shutdown stage of the recovery, then just skip it - */ - if( autor_mask_faults ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):process_fault() " - "Currently recovering the job. Failure masked!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - - /* - * Append this process to the list to process - */ - wp_item = OBJ_NEW(errmgr_autor_wp_item_t); - wp_item->name.jobid = proc->jobid; - wp_item->name.vpid = proc->vpid; - ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch); - wp_item->state = state; - - opal_list_append(procs_pending_recovery, &(wp_item->super)); - - /* - * Activate the timer, if it is not already setup - */ - if( !autor_timer_active ) { - autor_timer_active = true; - - opal_event_evtimer_set(opal_event_base, autor_timer_event, errmgr_autor_recover_processes, NULL); - soon.tv_sec = mca_errmgr_hnp_component.autor_recovery_delay; - soon.tv_usec = 0; - opal_event_evtimer_add(autor_timer_event, &soon); - } - - return; -} - -static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - orte_proc_t *loc_proc = NULL, *child_proc = NULL; - orte_std_cntr_t i_proc; - int32_t i; - - OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor): process_fault_daemon() " - "------- Daemon fault reported! 
proc %s (0x%x)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - state)); - - /* - * Set the process state in the job data structure - */ - for(i = 0; i < jdata->procs->size; ++i) { - if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - - if( loc_proc->name.vpid != proc->vpid) { - continue; - } - - loc_proc->state = state; - - break; - } - - /* - * Remove the route to this process - */ - orte_routed.delete_route(proc); - - /* - * If the aborted daemon had active processes on its node, then we should - * make sure to signal that all the children are gone. - */ - if( loc_proc->node->num_procs > 0 ) { - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s errmgr:base: stabalize_runtime() " - "------- Daemon lost with the following processes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) { - child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc); - if( NULL == child_proc ) { - continue; - } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s errmgr:base: stabalize_runtime() " - "\t %s [0x%x]", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&child_proc->name), - child_proc->state)); - - if( child_proc->last_errmgr_state < child_proc->state ) { - child_proc->last_errmgr_state = child_proc->state; - orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED, - &(child_proc->name), ORTE_PROC_STATE_COMM_FAILED, - 0, 1); - } - } - } else { - /* This daemon had no children, so just mask the failure */ - mca_errmgr_hnp_component.ignore_current_update = true; - } - - /* - * Record the dead daemon - */ - orte_errmgr_hnp_record_dead_process(proc); - - return; -} - -void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp) -{ - wp->name.jobid = ORTE_JOBID_INVALID; - wp->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN); - - 
wp->state = 0; -} - -void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp) -{ - wp->name.jobid = ORTE_JOBID_INVALID; - wp->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID); - - wp->state = 0; -} - -static int display_procs(void ) -{ - opal_list_item_t *item = NULL; - errmgr_autor_wp_item_t *wp_item = NULL; - char *proc_str = NULL; - char *tmp_str = NULL; - - for(item = opal_list_get_first(procs_pending_recovery); - item != opal_list_get_end(procs_pending_recovery); - item = opal_list_get_next(item) ) { - wp_item = (errmgr_autor_wp_item_t*)item; - - if( NULL == proc_str ) { - asprintf(&proc_str, "\t%s Rank %d\n", - ORTE_NAME_PRINT(&(wp_item->name)), - (int)wp_item->name.vpid); - } else { - tmp_str = strdup(proc_str); - free(proc_str); - proc_str = NULL; - asprintf(&proc_str, "%s\t%s Rank %d\n", - tmp_str, - ORTE_NAME_PRINT(&(wp_item->name)), - (int)wp_item->name.vpid); - } - } - - opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovering_job", true, - proc_str); - - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - if( NULL != proc_str ) { - free(proc_str); - proc_str = NULL; - } - - return ORTE_SUCCESS; -} - -static int autor_procs_sort_compare_fn(opal_list_item_t **a, - opal_list_item_t **b) -{ - errmgr_autor_wp_item_t *wp_a, *wp_b; - - wp_a = (errmgr_autor_wp_item_t*)(*a); - wp_b = (errmgr_autor_wp_item_t*)(*b); - - if( wp_a->name.vpid > wp_b->name.vpid ) { - return 1; - } - else if( wp_a->name.vpid == wp_b->name.vpid ) { - return 0; - } - else { - return -1; - } -} - -static void errmgr_autor_recover_processes(int fd, short event, void *cbdata) -{ - int ret, exit_status = ORTE_SUCCESS; - opal_list_item_t *item = NULL; - errmgr_autor_wp_item_t *wp_item = NULL; - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - orte_sstore_base_global_snapshot_info_t *snapshot = NULL; - char * tmp_str = NULL; - - autor_mask_faults = true; - ERRMGR_AUTOR_CLEAR_TIMERS(); - 
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START); - - /* - * Display the processes that are to be recovered - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):recover() " - "------- Display known failed processes in the job %s -------", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(current_global_jobdata->jobid))); - - opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn); - display_procs(); - - /* - * Find the latest checkpoint - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):recover() " - "------- Find the latest checkpoint for the job %s -------", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(current_global_jobdata->jobid))); - - snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); - if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable, snapshot)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP); - - /* - * Safely terminate the entire job - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Safely terminate the job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) { - proc->state = ORTE_PROC_STATE_MIGRATING; - } - if( current_global_jobdata->stdin_target == proc->name.vpid ) { - orte_iof.close(&(proc->name), ORTE_IOF_STDIN); - } - } - - orte_plm.terminate_procs(current_global_jobdata->procs); - - /* - * Wait for the job to terminate all processes - */ - while(!check_if_terminated(current_global_jobdata->procs) ) { - opal_progress(); - } - - 
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM); - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Done waiting for termination of job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - current_global_jobdata->num_terminated = current_global_jobdata->num_procs; - orte_plm_base_reset_job(current_global_jobdata); - - /* - * Construct the app contexts to restart - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "%s errmgr:hnp(autor):recover() " - "------- Rebuild job %s app context -------", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(current_global_jobdata->jobid))); - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, - proc, - &(snapshot->local_snapshots))) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\tAdjusted: \"%s\" [0x%d] [%s]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); - } - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP); - - /* - * Spawn the restarted job - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Respawning the job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - orte_snapc_base_has_recovered = false; - autor_mask_faults = false; /* Failures pass this point are worth noting */ - orte_plm.spawn(current_global_jobdata); - - /* - * Wait for all the processes to restart - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Waiting for restart -------"); - 
while(!check_if_restarted(current_global_jobdata->procs) ) { - opal_progress(); - } - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART); - - /* - * All done - */ - while( !orte_snapc_base_has_recovered ) { - opal_progress(); - } - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(autor):recover() " - "------- Finished recovering job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true); - - ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH); - - cleanup: - while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) { - wp_item = (errmgr_autor_wp_item_t*)item; - OBJ_RELEASE(wp_item); - } - - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - ERRMGR_AUTOR_DISPLAY_ALL_TIMERS(); - - autor_timer_active = false; - autor_mask_faults = false; - - return; -} - -static int check_if_terminated(opal_pointer_array_t *procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - if( NULL == procs ){ - return true; - } - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->state < ORTE_PROC_STATE_UNTERMINATED || - proc->state == ORTE_PROC_STATE_MIGRATING ) { - is_done = false; - break; - } - } - - if( !is_done ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t Still waiting for termination: \"%s\" [0x%x] < [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_UNTERMINATED)); - } - - return is_done; -} - -static int check_if_restarted(opal_pointer_array_t *procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - if( NULL == procs ){ - return true; - } - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { - proc = 
(orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { - is_done = false; - break; - } - } - - if( !is_done ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); - } - - return is_done; -} - -/************************ - * Timing - ************************/ -static void errmgr_autor_set_time(int idx) -{ - if(idx < ERRMGR_AUTOR_TIMER_MAX ) { - if( timer_start[idx] <= 0.0 ) { - timer_start[idx] = errmgr_autor_get_time(); - } - } -} - -static void errmgr_autor_display_all_timers(void) -{ - double diff = 0.0; - char * label = NULL; - - opal_output(0, "Auto. Recovery Timing: ******************** Summary Begin\n"); - - /********** Structure Setup **********/ - label = strdup("Setup"); - diff = timer_start[ERRMGR_AUTOR_TIMER_SETUP] - timer_start[ERRMGR_AUTOR_TIMER_START]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Termination **********/ - label = strdup("Terminate"); - diff = timer_start[ERRMGR_AUTOR_TIMER_TERM] - timer_start[ERRMGR_AUTOR_TIMER_SETUP]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Setup new job **********/ - label = strdup("Setup Relaunch"); - diff = timer_start[ERRMGR_AUTOR_TIMER_RESETUP] - timer_start[ERRMGR_AUTOR_TIMER_TERM]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Restart **********/ - label = strdup("Restart"); - diff = timer_start[ERRMGR_AUTOR_TIMER_RESTART] - timer_start[ERRMGR_AUTOR_TIMER_RESETUP]; - errmgr_autor_display_indv_timer_core(diff, label); - free(label); - - /********** Finish **********/ - label = strdup("Finalize"); - diff = timer_start[ERRMGR_AUTOR_TIMER_FINISH] - timer_start[ERRMGR_AUTOR_TIMER_RESTART]; - errmgr_autor_display_indv_timer_core(diff, label); - 
free(label); - - opal_output(0, "Auto. Recovery Timing: ******************** Summary End\n"); -} - -static void errmgr_autor_clear_timers(void) -{ - int i; - for(i = 0; i < ERRMGR_AUTOR_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - } -} - -static double errmgr_autor_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void errmgr_autor_display_indv_timer_core(double diff, char *str) -{ - double total = 0; - double perc = 0; - - total = timer_start[ERRMGR_AUTOR_TIMER_MAX-1] - timer_start[ERRMGR_AUTOR_TIMER_START]; - perc = (diff/total) * 100; - - opal_output(0, - "errmgr_autor: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", - str, - diff, - total, - perc); - return; -} - -#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_component.c b/orte/mca/errmgr/hnp/errmgr_hnp_component.c deleted file mode 100644 index 3d4ad91ed4..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp_component.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "opal/util/output.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "errmgr_hnp.h" - -/* - * Public string for version number - */ -const char *orte_errmgr_hnp_component_version_string = - "ORTE ERRMGR hnp MCA component version " ORTE_VERSION; - -/* - * Local functionality - */ -static int orte_errmgr_hnp_open(void); -static int orte_errmgr_hnp_close(void); - -/* - * Instantiate the public struct with all of our public information - * and pointer to our public functions in it - */ -orte_errmgr_hnp_component_t mca_errmgr_hnp_component = { - /* First do the base component stuff */ - { - /* Handle the general mca_component_t struct containing - * meta information about the component hnp - */ - { - ORTE_ERRMGR_BASE_VERSION_3_0_0, - /* Component name and version */ - "hnp", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_errmgr_hnp_open, - orte_errmgr_hnp_close, - orte_errmgr_hnp_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - - /* Verbosity level */ - 0, - /* opal_output handler */ - -1, - /* Default priority */ - 5 - } -}; - -static int orte_errmgr_hnp_open(void) -{ - int val; - - /* - * This should be the last componet to ever get used since - * it doesn't do anything. 
- */ - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "priority", - "Priority of the ERRMGR hnp component", - false, false, - mca_errmgr_hnp_component.super.priority, - &mca_errmgr_hnp_component.super.priority); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "verbose", - "Verbose level for the ERRMGR hnp component", - false, false, - mca_errmgr_hnp_component.super.verbose, - &mca_errmgr_hnp_component.super.verbose); - /* If there is a custom verbose level for this component than use it - * otherwise take our parents level and output channel - */ - if ( 0 != mca_errmgr_hnp_component.super.verbose) { - mca_errmgr_hnp_component.super.output_handle = opal_output_open(NULL); - opal_output_set_verbosity(mca_errmgr_hnp_component.super.output_handle, - mca_errmgr_hnp_component.super.verbose); - } else { - mca_errmgr_hnp_component.super.output_handle = orte_errmgr_base.output; - } - -#if OPAL_ENABLE_FT_CR - /**************************** - * CRMig (C/R Process Migration) MCA Options - ****************************/ - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "crmig_timing", - "Enable Process Migration timer", - false, false, - 0, &val); - mca_errmgr_hnp_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "crmig_enable", - "Enable Process Migration (Default: 0/off)", - false, false, - 0, &val); - mca_errmgr_hnp_component.crmig_enabled = OPAL_INT_TO_BOOL(val); - - /**************************** - * AutoR (Automatic Recovery) MCA Options - ****************************/ - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_timing", - "Enable Automatic Recovery timer", - false, false, - 0, &val); - mca_errmgr_hnp_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_enable", - "Enable Automatic Recovery (Default: 
0/off)", - false, false, - 0, &val); - mca_errmgr_hnp_component.autor_enabled = OPAL_INT_TO_BOOL(val); - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_recovery_delay", - "Number of seconds to wait before starting to recover the job after a failure" - " [Default: 1 sec]", - false, false, - 1, &val); - mca_errmgr_hnp_component.autor_recovery_delay = val; - - mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version, - "autor_skip_oldnode", - "Skip the old node from failed proc, even if it is still available" - " [Default: Enabled]", - false, false, - 1, &val); - mca_errmgr_hnp_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val); -#else - val = 0; /* Silence compiler warning */ -#endif /* OPAL_ENABLE_FT_CR */ - - /* - * Debug Output - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open()"); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: priority = %d", - mca_errmgr_hnp_component.super.priority); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: verbosity = %d", - mca_errmgr_hnp_component.super.verbose); -#if OPAL_ENABLE_FT_CR - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: --- CR Migration Options ---"); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: Process Migration = %s", - (mca_errmgr_hnp_component.crmig_enabled ? "Enabled" : "Disabled")); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: timing = %s", - (mca_errmgr_hnp_component.crmig_timing_enabled ? "Enabled" : "Disabled")); - - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: --- Auto. Recovery Options ---"); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: Auto. Recover = %s", - (mca_errmgr_hnp_component.autor_enabled ? 
"Enabled" : "Disabled")); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: timing = %s", - (mca_errmgr_hnp_component.autor_timing_enabled ? "Enabled" : "Disabled")); - opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: open: recover_delay = %d", - mca_errmgr_hnp_component.autor_recovery_delay); - - mca_errmgr_hnp_component.crmig_in_progress = false; - mca_errmgr_hnp_component.autor_in_progress = false; - mca_errmgr_hnp_component.term_in_progress = false; -#endif /* OPAL_ENABLE_FT_CR */ - - return ORTE_SUCCESS; -} - -static int orte_errmgr_hnp_close(void) -{ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp: close()"); - - return ORTE_SUCCESS; -} diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c deleted file mode 100644 index 8698f959d4..0000000000 --- a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c +++ /dev/null @@ -1,1517 +0,0 @@ -/* - * Copyright (c) 2009-2010 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/show_help.h" -#include "opal/util/output.h" -#include "opal/util/opal_environ.h" -#include "opal/util/basename.h" -#include "opal/util/argv.h" -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" -#include "opal/mca/crs/crs.h" -#include "opal/mca/crs/base/base.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/runtime/orte_globals.h" -#include "opal/dss/dss.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/iof/iof.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/base/plm_private.h" -#include "orte/mca/filem/filem.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/runtime/orte_wait.h" -#include "orte/mca/rmaps/rmaps_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/snapc/snapc.h" -#include "orte/mca/snapc/base/base.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_hnp.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - -#if OPAL_ENABLE_FT_CR - -/************************************ - * Locally Global vars & functions :) - ************************************/ -static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; -static orte_job_t *current_global_jobdata = NULL; - -static bool migrating_underway = false; -static bool migrating_terminated = false; -static bool migrating_restarted = false; - -static opal_list_t *current_onto_mapping_general = NULL; -static opal_list_t *current_onto_mapping_exclusive = NULL; - -/*** Command Line Interactions */ -static int current_migration_status = 
ORTE_ERRMGR_MIGRATE_STATE_NONE; - -static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map); - -static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state); -static void errmgr_crmig_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); -static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state); - -static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs); -static int check_if_terminated(opal_pointer_array_t *migrating_procs); -static int check_if_restarted(opal_pointer_array_t *migrating_procs); - -static int check_and_pre_map(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum); - -static void display_request(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum); - -/* - * Timer stuff - */ -static void errmgr_crmig_set_time(int idx); -static void errmgr_crmig_display_all_timers(void); -static void errmgr_crmig_clear_timers(void); - -static double errmgr_crmig_get_time(void); -static void errmgr_crmig_display_indv_timer_core(double diff, char *str); -static double timer_start[OPAL_CR_TIMER_MAX]; - -#define ERRMGR_CRMIG_TIMER_START 0 -#define ERRMGR_CRMIG_TIMER_SETUP 1 -#define ERRMGR_CRMIG_TIMER_CKPT 2 -#define ERRMGR_CRMIG_TIMER_TERM 3 -#define ERRMGR_CRMIG_TIMER_RESETUP 4 -#define ERRMGR_CRMIG_TIMER_RESTART 5 -#define ERRMGR_CRMIG_TIMER_FINISH 6 -#define ERRMGR_CRMIG_TIMER_MAX 7 - -#define ERRMGR_CRMIG_CLEAR_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \ - errmgr_crmig_clear_timers(); \ - } \ - } - -#define ERRMGR_CRMIG_SET_TIMER(idx) \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \ - errmgr_crmig_set_time(idx); \ - } \ - } - -#define 
ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \ - { \ - if(OPAL_UNLIKELY(mca_errmgr_hnp_component.crmig_timing_enabled > 0)) { \ - errmgr_crmig_display_all_timers(); \ - } \ - } - -/************************ - * Function Definitions: Global - ************************/ -int orte_errmgr_hnp_crmig_global_module_init(void) -{ - int ret; - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig): init()"); - - migrating_underway = false; - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - /* - * Initialize the connection to the orte-migrate tool - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - ERRMGR_CRMIG_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_crmig_global_module_finalize(void) -{ - int ret; - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig): finalize()"); - - /* - * Finalize the connection to the orte-migrate tool - */ - if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - migrating_underway = false; - - current_global_jobid = ORTE_JOBID_INVALID; - current_global_jobdata = NULL; - - ERRMGR_CRMIG_CLEAR_TIMERS(); - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_crmig_global_predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map) -{ - int ret, exit_status = ORTE_SUCCESS; - orte_job_t *jdata = NULL; - int i; - - /* - * JJH: RETURN HERE - * If we are already migrating, then reject this request - */ - if( migrating_underway ) { - ; - } - - /* - * Determine the jobid for this migration - * JJH: Assumes only one job active at any one time - */ - for(i = 0; i < orte_job_data->size; ++i ) { - if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } - /* Exclude outselves */ - if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) { - continue; - } - 
current_global_jobdata = jdata; - current_global_jobid = jdata->jobid; - break; - } - if( NULL == current_global_jobdata ) { - opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job."); - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_REQUEST; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /************************* - * Kick off the migration - *************************/ - if( ORTE_SUCCESS != (ret = errmgr_crmig_global_migrate(proc_list, node_list, suggested_map)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /************************ - * Set up the Command Line listener again - *************************/ - if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true); - } - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; - - cleanup: - return exit_status; -} - -int orte_errmgr_hnp_crmig_global_update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc_name, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - orte_job_t *jdata = NULL; - int ret = ORTE_SUCCESS; - - /* - * if orte is trying to shutdown, just let it - */ - if( mca_errmgr_hnp_component.term_in_progress ) { - return ORTE_SUCCESS; - } - - /* - * Get the job data object for this process - */ - if( NULL != proc_name ) { /* Get job from proc's jobid */ - jdata = orte_get_job_data_object(proc_name->jobid); - } else { /* Get from the general job */ - jdata = orte_get_job_data_object(job); - 
} - if( NULL == jdata ) { - opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); - ret = ORTE_ERROR; - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * If this is a tool, ignore - */ - if( jdata->num_apps == 0 && - OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(crmig): An external tool disconnected. Ignore...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp(crmig): job %s reported state %s" - " for proc %s state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), - orte_job_state_to_str(jobstate), - (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), - orte_proc_state_to_str(state), exit_code)); - - if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || - ORTE_PROC_STATE_COMM_FAILED == state ) { - if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_crmig_global_process_fault(jdata, proc_name, state)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { - if( migrating_underway ) { - /* If we are migrating, then we need to mask this to prevent the lower level from terminating us */ - mca_errmgr_hnp_component.ignore_current_update = true; - orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code); - } - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_crmig_global_suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ - int exit_status = ORTE_SUCCESS; - opal_list_item_t *item = NULL, *m_item = NULL; - orte_errmgr_predicted_map_t *onto_map = NULL, *current_proc_map = NULL; - orte_node_t *node = NULL; - bool found = false; - int num_suggested = 0; - orte_std_cntr_t i_proc; - 
orte_proc_t *peer_proc = NULL; - - /* - * If not migrating, then suggest nothing - */ - if( !migrating_underway ) { - return ORTE_SUCCESS; - } - - /* - * First look for an exclusive mapping for this process - */ - for(item = opal_list_get_first(current_onto_mapping_exclusive); - item != opal_list_get_end(current_onto_mapping_exclusive); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - if( onto_map->proc_name.vpid == proc->name.vpid ) { - current_proc_map = onto_map; - break; - } - } - - /* - * If there is an exclusive mapping then... - */ - if( NULL != current_proc_map ) { - /* - * If we made an exclusive mapping during the check_and_pre_map() - * then honor it here. - */ - if( NULL != current_proc_map->pre_map_fixed_node ) { - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude all other nodes */ - found = false; - - if( 0 == strncmp(node->name, current_proc_map->pre_map_fixed_node, - strlen(current_proc_map->pre_map_fixed_node)) ) { - found = true; - break; - } - if( !found ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - continue; - } else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name, - current_proc_map->pre_map_fixed_node, node->name)); - } - } - - /* All done with mapping */ - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * If 'off_current_node' then exclude current node - */ - if( current_proc_map->off_current_node ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name)); - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); 
- item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude the old node */ - if( node == oldnode ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - break; - } - } - } - - /* - * If 'map_proc_name' then map to the node where this process resides - * Note: Only do this if there was no 'other' node suggested. If there - * was an 'other' node suggested then we need to honor that before - * we honor the peer suggestion. - */ - if( ORTE_VPID_INVALID != current_proc_map->map_proc_name.vpid && - current_proc_map->proc_name.vpid != current_proc_map->map_proc_name.vpid && - NULL == current_proc_map->map_node_name ) { - /* - * Find the node containting the target process - */ - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - peer_proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == peer_proc ) { - continue; - } - if( peer_proc->name.vpid == current_proc_map->map_proc_name.vpid ) { - current_proc_map->map_node_name = strdup(peer_proc->node->name); - break; - } - } - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------", - ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name), - oldnode->name, current_proc_map->map_node_name)); - } - - /* - * If 'map_node_name' then use this node exclusively - */ - if( NULL != current_proc_map->map_node_name ) { - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude all nodes not in the include list */ - found = false; - - if( 0 == strncmp(node->name, current_proc_map->map_node_name, strlen(current_proc_map->map_node_name)) ) { - found = true; - } - if( !found ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - continue; - } else { - 
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name, - current_proc_map->map_node_name, node->name)); - } - } - - /* All done with mapping */ - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Otherwise then map as if there was no exclusive mapping - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------", - ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - } - /* - * If no exclusive mapping (or exclusive did not yield any results) then... - */ - else { - /* - * Remove the old node from the list, if there are more than 1 nodes available - */ - if(1 < opal_list_get_size(node_list) ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------", - ORTE_NAME_PRINT(&proc->name), oldnode->name)); - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude the old node */ - if( node == oldnode ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - break; - } - } - } - } - - /* - * If we do not have any general suggestions, then just return - */ - if( opal_list_get_size(current_onto_mapping_general) <= 0 ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------", - ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Otherwise look through the general suggestions as an include list - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() 
------- Suggest a target for [%15s : 0x%x : %10s] -------", - ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - - num_suggested = 0; - for( item = opal_list_get_first(node_list); - item != opal_list_get_end(node_list); - item = opal_list_get_next(item) ) { - node = (orte_node_t*)item; - - /* Exclude all nodes not in the include list */ - found = false; - - for(m_item = opal_list_get_first(current_onto_mapping_general); - m_item != opal_list_get_end(current_onto_mapping_general); - m_item = opal_list_get_next(m_item) ) { - onto_map = (orte_errmgr_predicted_map_t*) m_item; - - if( 0 == strncmp(node->name, onto_map->map_node_name, strlen(onto_map->map_node_name)) ) { - found = true; - break; - } - } - if( !found ) { - opal_list_remove_item(node_list, item); - OBJ_RELEASE(item); - continue; - } - - ++num_suggested; - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------", - num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name)); - } - - cleanup: - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------", - (int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); - - return exit_status; -} - -int orte_errmgr_hnp_crmig_global_ft_event(int state) -{ - return ORTE_SUCCESS; -} - - -/************************ - * Function Definitions: Static - ************************/ -static int orte_errmgr_hnp_crmig_global_process_fault(orte_job_t *jdata, - orte_process_name_t *proc_name, - orte_proc_state_t state) -{ - /* - * JJH: Todo - * The expected logic here is: - * if( a daemon with children fails ) { - * abort migration. - * } - * if( a daemon without children fails ) { - * continue. No processes lost - * } - * if( an application process fails ) { - * abort migration. 
Might be a bad checkpoint, or a process that we were - * not migrating that died. - * } - * else { - * continue; - * } - */ - if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { - errmgr_crmig_process_fault_daemon(jdata, proc_name, state); - } else { - errmgr_crmig_process_fault_app(jdata, proc_name, state); - } - - return ORTE_SUCCESS; -} - -static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps) -{ - int ret, exit_status = ORTE_SUCCESS; - orte_std_cntr_t i_node; - orte_std_cntr_t i_proc; - orte_node_t *node = NULL; - orte_proc_t *proc = NULL; - bool found = false; - orte_snapc_base_quiesce_t *cur_datum = NULL; - bool close_iof_stdin = false; - orte_process_name_t iof_name = {ORTE_JOBID_INVALID, 0}; - char * err_str_procs = NULL; - char * err_str_nodes = NULL; - char * tmp_str = NULL; - orte_errmgr_predicted_proc_t *off_proc = NULL; - orte_errmgr_predicted_node_t *off_node = NULL; - orte_errmgr_predicted_map_t *onto_map = NULL; - opal_list_item_t *item = NULL; - - ERRMGR_CRMIG_CLEAR_TIMERS(); - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START); - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------", - (int)opal_list_get_size(off_procs), - (int)opal_list_get_size(off_nodes), - (int)opal_list_get_size(onto_maps))); - - /* - * Modeled after orte_plm_base_reset_job - */ - cur_datum = OBJ_NEW(orte_snapc_base_quiesce_t); - cur_datum->migrating = true; - migrating_underway = true; - mca_errmgr_hnp_component.crmig_in_progress = true; - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUNNING; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Check to make sure that the 'off' and 'onto' nodes exist - * - if 'onto' nodes do not, then add them (JJH XXX) - * - if 'off' nodes do not, then return an error (JJH 
XXX) - * JJH TODO... - */ - - /* - * Copy over the onto_nodes so we can suggest them later - */ - if( NULL != current_onto_mapping_general ) { - OBJ_RELEASE(current_onto_mapping_general); - current_onto_mapping_general = NULL; - } - if( NULL != current_onto_mapping_exclusive ) { - OBJ_RELEASE(current_onto_mapping_exclusive); - current_onto_mapping_exclusive = NULL; - } - current_onto_mapping_general = OBJ_NEW(opal_list_t); - current_onto_mapping_exclusive = OBJ_NEW(opal_list_t); - if( NULL != onto_maps ) { - while( NULL != (item = opal_list_remove_first(onto_maps)) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - /* Determine if process exclude mapping, or general */ - if( onto_map->proc_name.vpid == ORTE_VPID_INVALID ) { - opal_list_append(current_onto_mapping_general, item); - } else { - opal_list_append(current_onto_mapping_exclusive, item); - } - } - } - - for(item = opal_list_get_first(current_onto_mapping_exclusive); - item != opal_list_get_end(current_onto_mapping_exclusive); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - /* - * Find the node currently containing this process - */ - found = false; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->name.vpid == onto_map->proc_name.vpid) { - found = true; - break; - } - } - - /* - * Check to see if this process hsould be skipped - */ - if( !onto_map->off_current_node && - (ORTE_VPID_INVALID == onto_map->map_proc_name.vpid || - onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) && - (NULL == onto_map->map_node_name || - 0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------", - 
ORTE_NAME_PRINT(&proc->name))); - - } else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------", - ORTE_NAME_PRINT(&proc->name))); - /* - * Set the process to restarting - */ - proc->state = ORTE_PROC_STATE_MIGRATING; - - opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); - OBJ_RETAIN(proc); - (cur_datum->num_migrating)++; - - if( current_global_jobdata->stdin_target == proc->name.vpid ) { - close_iof_stdin = true; - iof_name.jobid = proc->name.jobid; - iof_name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); - } - } - } - - migrating_terminated = false; - migrating_restarted = false; - - /* - * Create a list of processes to migrate, if 'off_nodes' specified - */ - for(item = opal_list_get_first(off_nodes); - item != opal_list_get_end(off_nodes); - item = opal_list_get_next(item) ) { - off_node = (orte_errmgr_predicted_node_t*)item; - - /* - * Find the node in the job structure - * - Make sure that 'odin00' doesn't match all 'odin00*' - */ - found = false; - for(i_node = 0; i_node < opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { - node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); - if( NULL == node ) { - continue; - } - - if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { - found = true; - break; - } - } - if( !found ) { - ; /* Warn about invalid node */ - } else { - /* - * Add all processes from this node - */ - for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); - if( NULL == proc ) { - continue; - } - - /* - * Set the process to restarting - */ - proc->state = ORTE_PROC_STATE_MIGRATING; - - opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); - OBJ_RETAIN(proc); - (cur_datum->num_migrating)++; - - if( 
current_global_jobdata->stdin_target == proc->name.vpid ) { - close_iof_stdin = true; - iof_name.jobid = proc->name.jobid; - iof_name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); - } - } - } - } - - /* - * Create a list of processes to migrate, if 'off_procs' specified - */ - for(item = opal_list_get_first(off_procs); - item != opal_list_get_end(off_procs); - item = opal_list_get_next(item) ) { - off_proc = (orte_errmgr_predicted_proc_t*)item; - - /* - * Find the process in the job structure - */ - found = false; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->name.vpid == off_proc->proc_name.vpid) { - found = true; - break; - } - } - /* - * Make sure the process is not listed multiple times - */ - if( found ) { - found = check_if_duplicate_proc(proc, &(cur_datum->migrating_procs)); - if( !found ) { - /* - * Set the process to restarting - */ - proc->state = ORTE_PROC_STATE_MIGRATING; - - opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); - OBJ_RETAIN(proc); - (cur_datum->num_migrating)++; - - if( current_global_jobdata->stdin_target == proc->name.vpid ) { - close_iof_stdin = true; - iof_name.jobid = proc->name.jobid; - iof_name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch); - } - } - } - } - - /* - * If we did not find any processes to migrate, then throw a warning, and skip it. 
- */ - if( 0 >= cur_datum->num_migrating ) { - for(item = opal_list_get_first(off_nodes); - item != opal_list_get_end(off_nodes); - item = opal_list_get_next(item) ) { - off_node = (orte_errmgr_predicted_node_t*)item; - if( NULL != err_str_nodes ) { - asprintf(&tmp_str, "%s, %s", err_str_nodes, off_node->node_name); - free(err_str_nodes); - err_str_nodes = strdup(tmp_str); - free(tmp_str); - tmp_str = NULL; - } else { - asprintf(&err_str_nodes, "%s", off_node->node_name); - } - } - - for(item = opal_list_get_first(off_procs); - item != opal_list_get_end(off_procs); - item = opal_list_get_next(item) ) { - off_proc = (orte_errmgr_predicted_proc_t*)item; - if( NULL != err_str_procs ) { - asprintf(&tmp_str, "%s, %d", err_str_procs, (int)off_proc->proc_name.vpid); - free(err_str_procs); - err_str_procs = strdup(tmp_str); - free(tmp_str); - tmp_str = NULL; - } else { - asprintf(&err_str_procs, "%d", off_proc->proc_name.vpid); - } - } - - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true, - err_str_nodes, - err_str_procs); - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - goto cleanup; - } - - /* - * Final pass on the migration list to pre-map processes and remove - * processes that should not be migrated. - */ - if( ORTE_SUCCESS != (ret = check_and_pre_map(off_procs, off_nodes, cur_datum)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Display the request before processing it. 
- */ - display_request(off_procs, off_nodes, cur_datum); - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_SETUP); - - /* - * Checkpoint the job - * - Hold all non-migrating processes - * - Abort the marked processes - * - - */ - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) { - opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint."); - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_CKPT); - - /* - * Terminate the migrating processes - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - orte_plm.terminate_procs(&cur_datum->migrating_procs); - - /* - * Clear the IOF stdin target if necessary - */ - if( close_iof_stdin ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid), - ORTE_NAME_PRINT(&iof_name) )); - - orte_iof.close(&iof_name, ORTE_IOF_STDIN); - } - - /* - * Wait for the processes to finish terminating - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Waiting for termination -------"); - - while( !migrating_terminated ) { - opal_progress(); - check_if_terminated(&(cur_datum->migrating_procs)); - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_TERM); - 
- /* - * Start remapping the processes - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * Reset the job parameters for restart - * This will set the state of the job to 'restart' - */ - orte_plm_base_reset_job(current_global_jobdata); - - /* - * Adjust the application context information - */ - for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); - if( NULL == proc ) { - continue; - } - - if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, - proc, - &(cur_datum->ss_snapshot->local_snapshots))) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\tAdjusted: \"%s\" [0x%d] [%s]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESETUP); - - /* - * Restart the job - * - spawn function will remap and launch the replacement proc(s) - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - orte_plm.spawn(current_global_jobdata); - - - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Waiting for restart -------"); - - migrating_restarted = false; - while( !migrating_restarted ) { - opal_progress(); - 
check_if_restarted(&(cur_datum->migrating_procs)); - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESTART); - - /* - * Finish the checkpoint - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) { - opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint."); - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * All done - */ - opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------", - ORTE_JOBID_PRINT(current_global_jobdata->jobid)); - - OBJ_RELEASE(cur_datum); - - current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH; - if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_FINISH); - ERRMGR_CRMIG_DISPLAY_ALL_TIMERS(); - - cleanup: - migrating_underway = false; - migrating_terminated = false; - migrating_restarted = false; - mca_errmgr_hnp_component.crmig_in_progress = false; - - if( NULL != err_str_procs ) { - free(err_str_procs); - err_str_procs = NULL; - } - - if( NULL != err_str_nodes ) { - free(err_str_nodes); - err_str_nodes = NULL; - } - - return exit_status; -} - -static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *loc_proc = NULL; - - for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { - loc_proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); - if( NULL == loc_proc ) { - continue; - } - if( loc_proc->name.vpid == proc->name.vpid ) { - return true; - } - } - - return false; -} - -static int 
check_if_terminated(opal_pointer_array_t *migrating_procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( !(ORTE_PROC_STATE_KILLED_BY_CMD & proc->state) ) { - is_done = false; - break; - } - } - - if( is_done ) { - migrating_terminated = true; - } - else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_KILLED_BY_CMD)); - } - - return ORTE_SUCCESS; -} - -static int check_if_restarted(opal_pointer_array_t *migrating_procs) -{ - orte_std_cntr_t i_proc; - orte_proc_t *proc = NULL; - bool is_done; - - is_done = true; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); - if( NULL == proc ) { - continue; - } - - /* proc->state != ORTE_PROC_STATE_LAUNCHED */ - if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { - is_done = false; - break; - } - } - - if( is_done ) { - migrating_restarted = true; - } - else { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); - } - - return ORTE_SUCCESS; -} - -static void errmgr_crmig_process_fault_app(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):process_fault_app() " - "------- Application fault reported! proc %s (0x%x) " - "- %s", - ORTE_NAME_PRINT(proc), - state, - (migrating_underway ? 
"Migrating" : "Not Migrating") )); - - return; -} - -static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, - orte_process_name_t *proc, - orte_proc_state_t state) -{ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):process_fault_daemon() " - "------- Daemon fault reported! proc %s (0x%x) " - "- %s", - ORTE_NAME_PRINT(proc), - state, - (migrating_underway ? "Migrating" : "Not Migrating") )); - - /* - * Failed communication can be ignored for the most part. - * Make sure to remove the route - * JJH: Check to make sure this is not a new daemon loss. - */ - if( ORTE_PROC_STATE_COMM_FAILED == state ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):process_fault_daemon() " - "------- Daemon fault reported! proc %s (0x%x) " - "- Communication failure, keep going", - ORTE_NAME_PRINT(proc), - state )); - } - - return; -} - -static int check_and_pre_map(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum) -{ - /* - * Check the 'off_procs' list for processes that should not be migrated - */ - - /* - * Check the 'current_onto_mapping_exclusive' for processes that are moving - * 'near/with' other processes that are also moving. Be sure to watch out - * for circular deadlock. - */ - - /* - * Use the 'pre_map_fixed_node' structure to fix this process' mapping. 
- */ - - return ORTE_SUCCESS; -} - -static void display_request(opal_list_t *off_procs, - opal_list_t *off_nodes, - orte_snapc_base_quiesce_t *cur_datum) -{ - orte_std_cntr_t i_node; - orte_std_cntr_t i_proc; - orte_node_t *node = NULL; - orte_proc_t *proc = NULL; - bool found = false; - char * status_str = NULL; - char * tmp_str = NULL; - orte_errmgr_predicted_proc_t *off_proc = NULL; - orte_errmgr_predicted_node_t *off_node = NULL; - orte_errmgr_predicted_map_t *onto_map = NULL; - opal_list_item_t *item = NULL; - - /* - * Display all requested processes to migrate - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n", - (int) opal_list_get_size(off_procs) )); - for(item = opal_list_get_first(off_procs); - item != opal_list_get_end(off_procs); - item = opal_list_get_next(item) ) { - off_proc = (orte_errmgr_predicted_proc_t*)item; - - /* - * Find the process in the job structure - */ - found = false; - for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); - if( NULL == proc ) { - continue; - } - - if( proc->name.vpid == off_proc->proc_name.vpid) { - found = true; - break; - } - } - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t%s (Rank %3d) on node %s\n", - ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name)); - } - - /* - * Display Off Nodes - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n", - (int)opal_list_get_size(off_nodes) )); - - for(item = opal_list_get_first(off_nodes); - item != opal_list_get_end(off_nodes); - item = opal_list_get_next(item) ) { - off_node = (orte_errmgr_predicted_node_t*)item; - - for(i_node = 0; i_node < 
opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { - node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); - if( NULL == node ) { - continue; - } - - found = false; - if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { - found = true; - break; - } - } - if( found ) { - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\"%s\" \t%d\n", - node->name, node->num_procs)); - for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); - if( NULL == proc ) { - continue; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\t\"%s\" [0x%x]\n", - ORTE_NAME_PRINT(&proc->name), proc->state)); - } - } - } - - /* - * Suggested onto nodes - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n", - (int)opal_list_get_size(current_onto_mapping_general) )); - for(item = opal_list_get_first(current_onto_mapping_general); - item != opal_list_get_end(current_onto_mapping_general); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\"%s\"\n", - onto_map->map_node_name)); - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n", - (int)opal_list_get_size(current_onto_mapping_exclusive) )); - for(item = opal_list_get_first(current_onto_mapping_exclusive); - item != opal_list_get_end(current_onto_mapping_exclusive); - item = opal_list_get_next(item) ) { - onto_map = (orte_errmgr_predicted_map_t*) item; - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t%d\t(%c)\t\"%s\"\n", - onto_map->proc_name.vpid, 
- (onto_map->off_current_node ? 'T' : 'F'), - onto_map->map_node_name)); - } - - /* - * Display all processes scheduled to migrate - */ - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n", - cur_datum->num_migrating)); - for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { - proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); - if( NULL == proc ) { - continue; - } - - OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, - "\t\"%s\" [0x%x] [%s]\n", - ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); - - if( NULL == status_str ) { - asprintf(&status_str, "\t%s Rank %d on Node %s\n", - ORTE_NAME_PRINT(&proc->name), - (int)proc->name.vpid, - proc->node->name); - } else { - tmp_str = strdup(status_str); - free(status_str); - status_str = NULL; - asprintf(&status_str, "%s\t%s Rank %d on Node %s\n", - tmp_str, - ORTE_NAME_PRINT(&proc->name), - (int)proc->name.vpid, - proc->node->name); - } - } - - opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true, - status_str); - - if( NULL != tmp_str ) { - free(tmp_str); - tmp_str = NULL; - } - - if( NULL != status_str ) { - free(status_str); - status_str = NULL; - } - - return; -} - -/************************ - * Timing - ************************/ -static void errmgr_crmig_set_time(int idx) -{ - if(idx < ERRMGR_CRMIG_TIMER_MAX ) { - if( timer_start[idx] <= 0.0 ) { - timer_start[idx] = errmgr_crmig_get_time(); - } - } -} - -static void errmgr_crmig_display_all_timers(void) -{ - double diff = 0.0; - char * label = NULL; - - opal_output(0, "Process Migration Timing: ******************** Summary Begin\n"); - - /********** Structure Setup **********/ - label = strdup("Setup"); - diff = timer_start[ERRMGR_CRMIG_TIMER_SETUP] - timer_start[ERRMGR_CRMIG_TIMER_START]; - errmgr_crmig_display_indv_timer_core(diff, label); - 
free(label); - - /********** Checkpoint **********/ - label = strdup("Checkpoint"); - diff = timer_start[ERRMGR_CRMIG_TIMER_CKPT] - timer_start[ERRMGR_CRMIG_TIMER_SETUP]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Termination **********/ - label = strdup("Terminate"); - diff = timer_start[ERRMGR_CRMIG_TIMER_TERM] - timer_start[ERRMGR_CRMIG_TIMER_CKPT]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Setup new job **********/ - label = strdup("Setup Relaunch"); - diff = timer_start[ERRMGR_CRMIG_TIMER_RESETUP] - timer_start[ERRMGR_CRMIG_TIMER_TERM]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Restart **********/ - label = strdup("Restart"); - diff = timer_start[ERRMGR_CRMIG_TIMER_RESTART] - timer_start[ERRMGR_CRMIG_TIMER_RESETUP]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - /********** Finish **********/ - label = strdup("Finalize"); - diff = timer_start[ERRMGR_CRMIG_TIMER_FINISH] - timer_start[ERRMGR_CRMIG_TIMER_RESTART]; - errmgr_crmig_display_indv_timer_core(diff, label); - free(label); - - opal_output(0, "Process Migration Timing: ******************** Summary End\n"); -} - -static void errmgr_crmig_clear_timers(void) -{ - int i; - for(i = 0; i < ERRMGR_CRMIG_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - } -} - -static double errmgr_crmig_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void errmgr_crmig_display_indv_timer_core(double diff, char *str) -{ - double total = 0; - double perc = 0; - - total = timer_start[ERRMGR_CRMIG_TIMER_MAX-1] - timer_start[ERRMGR_CRMIG_TIMER_START]; - perc = (diff/total) * 100; - - opal_output(0, - "errmgr_crmig: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", - str, 
- diff, - total, - perc); - return; -} - -#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt b/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt deleted file mode 100644 index 836e46f4b0..0000000000 --- a/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt +++ /dev/null @@ -1,71 +0,0 @@ - -*- text -*- -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for ORTE Errmgr HNP module. -# -[errmgr-hnp:unknown-job-error] -An error has occurred in an unknown job. This generally should not happen -except due to an internal ORTE error. - -Job state: %s - -This information should probably be reported to the OMPI developers. -# -[errmgr-hnp:daemon-died] -The system has lost communication with the following daemon: - -Daemon: %s -Node: %s - -The reason for the lost communication channel is unknown. Possible -reasons include failure of the daemon itself, failure of the -connecting fabric/switch, and loss of the host node. Please -check with your system administrator to try and determine the -source of the problem. - -Your job is being terminated as a result. -# -[errmgr-hnp:cannot-relocate] -The system is unable to relocate the specified process: - -Process: %s - -because the application for that process could not be found. This -appears to be a system error. Please report it to the ORTE -developers. - -[autor_recovering_job] -Notice: The processes listed below failed unexpectedly. - Using the last checkpoint to recover the job. - Please standby. -%s -[autor_recovery_complete] -Notice: The job has been successfully recovered from the - last checkpoint. -[autor_failed_to_recover_proc] -Error: The process below has failed. There is no checkpoint available for - this job, so we are terminating the application since automatic - recovery cannot occur. 
-Internal Name: %s -MCW Rank: %d - -[crmig_migrating_job] -Notice: A migration of this job has been requested. - The processes below will be migrated. - Please standby. -%s -[crmig_migrated_job] -Notice: The processes have been successfully migrated to/from the specified - machines. -[crmig_no_migrating_procs] -Warning: Could not find any processes to migrate on the nodes specified. - You provided the following: -Nodes: %s -Procs: %s diff --git a/orte/mca/errmgr/orted/Makefile.am b/orte/mca/errmgr/orted/Makefile.am deleted file mode 100644 index dae952bcf6..0000000000 --- a/orte/mca/errmgr/orted/Makefile.am +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -dist_pkgdata_DATA = help-orte-errmgr-orted.txt - -sources = \ - errmgr_orted.h \ - errmgr_orted_component.c \ - errmgr_orted.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_errmgr_orted_DSO -component_noinst = -component_install = mca_errmgr_orted.la -else -component_noinst = libmca_errmgr_orted.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_orted_la_SOURCES = $(sources) -mca_errmgr_orted_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_orted_la_SOURCES =$(sources) -libmca_errmgr_orted_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/orted/configure.m4 b/orte/mca/errmgr/orted/configure.m4 deleted file mode 100644 index 8c10aa375c..0000000000 --- a/orte/mca/errmgr/orted/configure.m4 +++ /dev/null @@ -1,19 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2011 Los Alamos National Security, LLC. -# All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# MCA_errmgr_orted_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_errmgr_orted_CONFIG], [ - AC_CONFIG_FILES([orte/mca/errmgr/orted/Makefile]) - - AS_IF([test "$orte_enable_resilient_code" = 1 -a "$orte_without_full_support" = 0], - [$1], - [$2]) -]) diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c deleted file mode 100644 index 646f31104d..0000000000 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ /dev/null @@ -1,1157 +0,0 @@ -/* - * Copyright (c) 2009-2010 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif - -#include "opal/util/output.h" -#include "opal/dss/dss.h" - -#include "orte/util/error_strings.h" -#include "orte/util/name_fns.h" -#include "orte/util/proc_info.h" -#include "orte/util/session_dir.h" -#include "orte/util/show_help.h" -#include "orte/util/nidmap.h" -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/data_type_support/orte_dt_support.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/odls/odls.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/odls/base/odls_private.h" -#include "orte/mca/plm/plm_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/sensor/sensor.h" -#include "orte/mca/ess/ess.h" -#include "orte/runtime/orte_quit.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/errmgr/errmgr.h" -#include 
"orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" - -#include "errmgr_orted.h" - -/* Local functions */ -static bool any_live_children(orte_jobid_t job); -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat); -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child); -static bool all_children_registered(orte_jobid_t job); -static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); -static void update_local_children(orte_odls_job_t *jobdat, - orte_job_state_t jobstate, - orte_proc_state_t state); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -#if ORTE_RESIL_ORTE -static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); -static int record_dead_process(orte_process_name_t *proc); -static int send_to_local_applications(opal_pointer_array_t *dead_names); -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata); -#endif - -/* - * Module functions: Global - */ -static int init(void); -static int finalize(void); - -static int predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map); - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code); - -static int suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list); - -static int ft_event(int state); - - -/****************** - * orted module - ******************/ -orte_errmgr_base_module_t orte_errmgr_orted_module = { - init, - finalize, - orte_errmgr_base_log, - orte_errmgr_base_abort, - orte_errmgr_base_abort_peers, - update_state, - predicted_fault, - suggest_map_targets, - ft_event, - orte_errmgr_base_register_migration_warning -#if ORTE_RESIL_ORTE - 
,orte_errmgr_base_set_fault_callback /* Set callback function */ -#endif -}; - -/************************ - * API Definitions - ************************/ -static int init(void) -{ - int ret = ORTE_SUCCESS; - -#if ORTE_RESIL_ORTE - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE, - ORTE_RML_PERSISTENT, failure_notification, NULL); -#endif - - return ret; -} - -static int finalize(void) -{ -#if ORTE_RESIL_ORTE - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE); -#endif - - return ORTE_SUCCESS; -} - -static void cbfunc(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OBJ_RELEASE(buffer); -} - -static int update_state(orte_jobid_t job, - orte_job_state_t jobstate, - orte_process_name_t *proc, - orte_proc_state_t state, - pid_t pid, - orte_exit_code_t exit_code) -{ - opal_list_item_t *item, *next; - orte_odls_job_t *jobdat = NULL; - orte_odls_child_t *child; - opal_buffer_t *alert; - orte_plm_cmd_flag_t cmd; - int rc=ORTE_SUCCESS; - orte_vpid_t null=ORTE_VPID_INVALID; - orte_app_context_t *app; - orte_ns_cmp_bitmask_t mask; - - /* - * if orte is trying to shutdown, just let it - */ - if (orte_finalizing) { - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:orted:update_state() %s) " - "------- %s state updated for process %s to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc) ? "App. Process" : - (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc) ? 
"NULL" : ORTE_NAME_PRINT(proc), - orte_proc_state_to_str(state))); - - /* if this is a heartbeat failure, let the HNP handle it */ - if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || - ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { - return ORTE_SUCCESS; - } - - /*** UPDATE COMMAND FOR A JOB ***/ - if (NULL == proc) { - /* this is an update for an entire job */ - if (ORTE_JOBID_INVALID == job) { - /* whatever happened, we don't know what job - * it happened to - */ - orte_show_help("help-orte-errmgr-orted.txt", "errmgr-orted:unknown-job-error", - true, orte_job_state_to_str(jobstate)); - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the "invalid" jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == job) { - break; - } - } - if (NULL == jobdat) { - return ORTE_ERR_NOT_FOUND; - } - - switch (jobstate) { - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jobdat, exit_code); - break; - case ORTE_JOB_STATE_RUNNING: - /* update all local child states */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); - case ORTE_JOB_STATE_COMM_FAILED: - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* tell the caller we can't recover */ - return ORTE_ERR_UNRECOVERABLE; - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* let the HNP handle this */ - return ORTE_SUCCESS; - break; - - default: - break; - } - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the job info */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { - ORTE_ERROR_LOG(rc); - } - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; - } - - /* if this was a failed comm, then see if it was to our - * lifeline - */ - if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; - - /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { - return ORTE_SUCCESS; - } - /* see if this was a lifeline */ - if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { - /* kill our children */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* terminate - our 
routed children will see - * us leave and automatically die - */ - orte_quit(); - } - /* purge the oob */ - orte_rml.purge(proc); - /* was it a daemon that failed? */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { - /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { - orte_quit(); - } else { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted not exiting, num_routes() == %d, num children == %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)orte_routed.num_routes(), - (int)opal_list_get_size(&orte_local_children))); - } - } - -#if ORTE_RESIL_ORTE - record_dead_process(proc); -#endif - - /* if not, then indicate we can continue */ - return ORTE_SUCCESS; - } - - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - /* must already be complete */ - return ORTE_SUCCESS; - } - - /* if there are no local procs for this job, we can - * ignore this call - */ - if (0 == jobdat->num_local_procs) { - return ORTE_SUCCESS; - } - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted got state %s for proc %s pid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_proc_state_to_str(state), - ORTE_NAME_PRINT(proc), pid)); - - /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ - if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { - /* find this proc in the local children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - /* kill this proc */ - killprocs(proc->jobid, proc->vpid); - } - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); - if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { - child->restarts++; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted restarting proc %s for the %d time", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), child->restarts)); - rc = orte_odls.restart_proc(child); - } - return rc; - } - } - } - - if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { - if (orte_abort_non_zero_exit) { - /* treat this as an abnormal - * termination - no recovery allowed - */ - goto REPORT_ABORT; - } - /* treat this as normal termination */ - goto REPORT_STATE; - } - - if (ORTE_PROC_STATE_TERMINATED < state) { - if( jobdat->enable_recovery ) { - OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base.output, - "%s RECOVERY ENABLED", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* find this proc in the local children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - /* see if this child has reached its local restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s CHECKING RESTARTS %d VS MAX %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - child->restarts, app->max_restarts)); - if (child->restarts < app->max_restarts ) { - /* attempt to restart it locally */ - child->restarts++; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted restarting proc %s for the %d time", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), child->restarts)); - if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) { - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - ORTE_ERROR_LOG(rc); - goto REPORT_ABORT; - } - return ORTE_SUCCESS; - } - } - } - } - -REPORT_ABORT: - /* if the job hasn't completed and the state is abnormally - * terminated, then we need to alert the HNP right away - */ - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack only the data for this proc - have to start with the jobid - * so the receiver can unpack it correctly - */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* find this proc in the local children */ - for (item = opal_list_get_first(&orte_local_children); 
- item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - child->exit_code = exit_code; - } - /* now pack the child's info */ - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* remove the child from our local list as it is no longer alive */ - opal_list_remove_item(&orte_local_children, &child->super); - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), - jobdat->num_local_procs)); - - /* release the child object */ - OBJ_RELEASE(child); - /* done with loop */ - break; - } - } - - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - return rc; - } - - REPORT_STATE: - /* find this proc in the local children so we can update its state */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { - if (ORTE_PROC_STATE_UNTERMINATED > child->state) { - child->state = state; - if (0 < pid) { - child->pid = pid; - } - child->exit_code = exit_code; - } - /* done with loop */ - break; - } - } - - if (ORTE_PROC_STATE_REGISTERED == state) { - /* see if everyone in this job has registered */ - if (all_children_registered(proc->jobid)) { - /* once everyone registers, send their contact info to - * the HNP so it is 
available to debuggers and anyone - * else that needs it - */ - - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted: sending contact info to HNP", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - alert = OBJ_NEW(opal_buffer_t); - /* pack init routes command */ - cmd = ORTE_PLM_INIT_ROUTES_CMD; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack all the local child vpids and epochs */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == proc->jobid) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } -#if ORTE_ENABLE_EPOCH - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->epoch, 1, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } -#endif - } - } - /* pack an invalid marker */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* add in contact info for all procs in the job */ - if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&alert); - return rc; - } - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - } - return rc; - } - - /* only other state is terminated - see if anyone is left alive */ - if (!any_live_children(proc->jobid)) { - /* lookup the local jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - 
item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - may not have been formed yet */ - return ORTE_SUCCESS; - } - - alert = OBJ_NEW(opal_buffer_t); - /* pack update state command */ - cmd = ORTE_PLM_UPDATE_PROC_STATE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { - ORTE_ERROR_LOG(rc); - goto FINAL_CLEANUP; - } - /* pack the data for the job */ - if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { - ORTE_ERROR_LOG(rc); - } - -FINAL_CLEANUP: - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted reporting all procs in %s terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); - - /* remove all of this job's children from the global list - do not lock - * the thread as we are already locked - */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - - if (jobdat->jobid == child->name->jobid) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); - } - } - - /* ensure the job's local session directory tree is removed */ - orte_session_dir_cleanup(jobdat->jobid); - - /* remove this job from our local job data since it is complete */ - opal_list_remove_item(&orte_local_jobdata, &jobdat->super); - OBJ_RELEASE(jobdat); - - /* send it */ - if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - - /* indicate that the job is complete */ - return rc; - } - return ORTE_SUCCESS; -} - -static int predicted_fault(opal_list_t *proc_list, - opal_list_t *node_list, - opal_list_t *suggested_map) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} - -static int 
suggest_map_targets(orte_proc_t *proc, - orte_node_t *oldnode, - opal_list_t *node_list) -{ - return ORTE_ERR_NOT_IMPLEMENTED; -} - -static int ft_event(int state) -{ - return ORTE_SUCCESS; -} - -#if ORTE_RESIL_ORTE -static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - opal_list_item_t *item; - orte_odls_child_t *child; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking procs as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(0, "NULL found in dead process list."); - continue; - } - - if (0 < ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { - continue; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item))); - -#if ORTE_ENABLE_EPOCH - /* Increment the epoch */ - orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); - orte_util_set_epoch(name_item, name_item->epoch + 1); -#endif - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* Remove the dead process from my list of children if applicable */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t *) item; - - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID, - child->name, name_item)) { - opal_list_remove_item(&orte_local_children, item); - OBJ_RELEASE(item); - break; - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - - /* Remove the route from the routing layer */ - orte_routed.delete_route(name_item); - } - - /* Update the routing module */ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if 
(NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); - } - - return ORTE_SUCCESS; -} - -static void failure_notification(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - opal_pointer_array_t *dead_names; - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - int32_t i; - orte_process_name_t *name_item; - - dead_names = OBJ_NEW(opal_pointer_array_t); - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return; - } - - for (i = 0; i < num_failed; i++) { - /* Unpack the buffer to get the dead process' name. */ - n = 1; - - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return; - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_NAME_PRINT(sender)); - } - - /* There shouldn't be an issue of receiving this message multiple - * times but it doesn't hurt to double check. - */ - if (0 < ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) { - opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); - continue; - } - - opal_pointer_array_add(dead_names, name_item); - } - - /* Tell the errmgr so it can handle changing the epoch, routes, etc. */ - mark_processes_as_dead(dead_names); - - /* Tell the applications' ORTE layers that there is a failure. 
*/ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return; - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } -} -#endif - -/***************** - * Local Functions - *****************/ -static bool any_live_children(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? */ - if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && - child->alive) { - return true; - } - } - - /* if we get here, then nobody is left alive from that job */ - return false; - -} - -static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) -{ - int rc; - - /* pack the child's vpid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the pid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the proc was launched */ - if (orte_timing) { - int64_t tmp; - tmp = child->starttime.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = child->starttime.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - /* pack its state */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack its exit code */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) { - 
ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) -{ - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_vpid_t null=ORTE_VPID_INVALID; - - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if (orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - /* if this child is part of the job... */ - if (child->name->jobid == jobdat->jobid) { - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - /* flag that this job is complete so the receiver can know */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} - -static bool all_children_registered(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? 
*/ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { - /* if this child has terminated, we consider it as having - * registered for the purposes of this function. If it never - * did register, then we will send a NULL rml_uri back to - * the HNP, which will then know that the proc did not register. - * If other procs did register, then the HNP can declare an - * abnormal termination - */ - if (ORTE_PROC_STATE_UNTERMINATED < child->state) { - /* this proc has terminated somehow - consider it - * as registered for now - */ - continue; - } - /* if this child is *not* registered yet, return false */ - if (!child->init_recvd) { - return false; - } - /* if this child has registered a finalize, return false */ - if (child->fini_recvd) { - return false; - } - } - } - - /* if we get here, then everyone in the job is currently registered */ - return true; - -} - -static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - int rc; - - /* the thread is locked elsewhere - don't try to do it again here */ - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? */ - if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { - /* pack the child's vpid - must be done in case rml_uri is NULL */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } -#if ORTE_ENABLE_EPOCH - /* Pack the child's epoch. 
*/ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - return rc; - } -#endif - /* pack the contact info */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - - return ORTE_SUCCESS; - -} - -static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* set the state */ - jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (child->name->jobid == jobdat->jobid) { - if (ORTE_PROC_STATE_LAUNCHED > child->state || - ORTE_PROC_STATE_FAILED_TO_START == child->state) { - /* this proc never launched - flag that the iof - * is complete or else we will hang waiting for - * pipes to close that were never opened - */ - child->iof_complete = true; - /* ditto for waitpid */ - child->waitpid_recvd = true; - } - } - } - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:hnp: job %s reported incomplete start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); - return; -} - -static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - /* update job state */ - jobdat->state = jobstate; - /* update children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (jobdat->jobid == child->name->jobid) { - child->state = state; - } - } -} - -static void killprocs(orte_jobid_t job, orte_vpid_t vpid) -{ - opal_pointer_array_t cmd; - orte_proc_t proc; - int rc; - - /* stop local sensors for this job */ - if 
(ORTE_VPID_WILDCARD == vpid) { - orte_sensor.stop(job); - } - - if (ORTE_JOBID_WILDCARD == job - && ORTE_VPID_WILDCARD == vpid) { - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { - ORTE_ERROR_LOG(rc); - } - return; - } - - OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); - OBJ_CONSTRUCT(&proc, orte_proc_t); - proc.name.jobid = job; - proc.name.vpid = vpid; - ORTE_EPOCH_SET(proc.name.epoch,orte_ess.proc_get_epoch(&(proc.name))); - opal_pointer_array_add(&cmd, &proc); - if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { - ORTE_ERROR_LOG(rc); - } - OBJ_DESTRUCT(&cmd); - OBJ_DESTRUCT(&proc); -} - -#if ORTE_RESIL_ORTE -static int record_dead_process(orte_process_name_t *proc) { - opal_pointer_array_t *dead_name; - opal_buffer_t *buffer; - int rc = ORTE_SUCCESS; - int num_failed; - - if (orte_odls_base_default_check_finished(proc)) { - return rc; - } - - dead_name = OBJ_NEW(opal_pointer_array_t); - - opal_pointer_array_add(dead_name, proc); - - /* Mark the process as dead */ - mark_processes_as_dead(dead_name); - - /* Send a message to the HNP */ - buffer = OBJ_NEW(opal_buffer_t); - - num_failed = 1; - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - } - - orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_FAILURE_NOTICE, 0, - cbfunc, NULL); - - OBJ_RELEASE(dead_name); - - return rc; -} - -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret; - orte_process_name_t *name_item; - int size, i; - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending %d failure(s) to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size)); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - 
OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ORTE_SUCCESS; -} -#endif - diff --git a/orte/mca/errmgr/orted/errmgr_orted.h b/orte/mca/errmgr/orted/errmgr_orted.h deleted file mode 100644 index 2c3e22f1be..0000000000 --- a/orte/mca/errmgr/orted/errmgr_orted.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - */ - -#ifndef MCA_ERRMGR_orted_EXPORT_H -#define MCA_ERRMGR_orted_EXPORT_H - -#include "orte_config.h" - -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Local Component structures - */ - -ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orted_component; - -ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orted_module; - -END_C_DECLS - -#endif /* MCA_ERRMGR_orted_EXPORT_H */ diff --git a/orte/mca/errmgr/orted/errmgr_orted_component.c b/orte/mca/errmgr/orted/errmgr_orted_component.c deleted file mode 100644 index d3ecc83021..0000000000 --- a/orte/mca/errmgr/orted/errmgr_orted_component.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "opal/util/output.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "errmgr_orted.h" - -/* - * Public string for version number - */ -const char *orte_errmgr_orted_component_version_string = - "ORTE ERRMGR orted MCA component version " ORTE_VERSION; - -/* - * Local functionality - */ -static int errmgr_orted_open(void); -static int errmgr_orted_close(void); -static int errmgr_orted_component_query(mca_base_module_t **module, int *priority); - -/* - * Instantiate the public struct with all of our public information - * and pointer to our public functions in it - */ -orte_errmgr_base_component_t mca_errmgr_orted_component = -{ - /* Handle the general mca_component_t struct containing - * meta information about the component itorted - */ - { - ORTE_ERRMGR_BASE_VERSION_3_0_0, - /* Component name and version */ - "orted", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - errmgr_orted_open, - errmgr_orted_close, - errmgr_orted_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - -static int errmgr_orted_open(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_orted_close(void) -{ - return ORTE_SUCCESS; -} - -static int errmgr_orted_component_query(mca_base_module_t **module, int *priority) -{ - if (ORTE_PROC_IS_DAEMON) { - /* keep our priority low so that other modules are higher - * and will run before us - */ - *priority = 5; - *module = (mca_base_module_t *)&orte_errmgr_orted_module; - return ORTE_SUCCESS; - } - - *priority = -1; - *module = NULL; - return ORTE_ERROR; -} - diff --git a/orte/mca/errmgr/orted/help-orte-errmgr-orted.txt b/orte/mca/errmgr/orted/help-orte-errmgr-orted.txt deleted file mode 100644 index c6d43f1f77..0000000000 --- 
a/orte/mca/errmgr/orted/help-orte-errmgr-orted.txt +++ /dev/null @@ -1,14 +0,0 @@ - -*- text -*- -# -# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for ORTE RecoS IGNORE framework. -# diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 5002ac1c70..6afaab6787 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -58,7 +58,6 @@ orte_ess_base_module_t orte_ess_alps_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -221,9 +220,6 @@ static int alps_set_name(void) ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = (orte_vpid_t)cnos_get_rank() + starting_vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch, - orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:alps set name to %s", diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index 4047423d80..b494e530fa 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -64,12 +64,6 @@ ORTE_DECLSPEC extern int orte_ess_base_std_buffering; ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available; -#if ORTE_ENABLE_EPOCH -ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc); -#else -ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc); -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /* @@ -81,7 +75,7 @@ ORTE_DECLSPEC int orte_ess_base_std_prolog(void); ORTE_DECLSPEC int orte_ess_base_app_setup(void); ORTE_DECLSPEC int orte_ess_base_app_finalize(void); -ORTE_DECLSPEC 
void orte_ess_base_app_abort(int status, bool report) __opal_attribute_noreturn__; +ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report); ORTE_DECLSPEC int orte_ess_base_tool_setup(void); ORTE_DECLSPEC int orte_ess_base_tool_finalize(void); diff --git a/orte/mca/ess/base/ess_base_fns.c b/orte/mca/ess/base/ess_base_fns.c index be7141a636..e5b1f4a5fc 100644 --- a/orte/mca/ess/base/ess_base_fns.c +++ b/orte/mca/ess/base/ess_base_fns.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,6 +45,10 @@ opal_paffinity_locality_t orte_ess_base_proc_get_locality(orte_process_name_t *p if (NULL == (pmap = orte_util_lookup_pmap(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s LOOKING FOR PROC %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); return OPAL_PROC_NON_LOCAL; } @@ -76,6 +82,10 @@ char* orte_ess_base_proc_get_hostname(orte_process_name_t *proc) if (NULL == (nid = orte_util_lookup_nid(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "%s LOOKING FOR PROC %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); return NULL; } diff --git a/orte/mca/ess/base/ess_base_open.c b/orte/mca/ess/base/ess_base_open.c index 8bf42512bf..dfa3722493 100644 --- a/orte/mca/ess/base/ess_base_open.c +++ b/orte/mca/ess/base/ess_base_open.c @@ -46,7 +46,6 @@ orte_ess_base_module_t orte_ess = { NULL, /* proc_get_hostname */ NULL, /* get_local_rank */ NULL, /* get_node_rank */ - NULL, /* proc_get_epoch */ NULL, /* update_pidmap */ NULL, /* update_nidmap */ NULL /* ft_event */ diff --git a/orte/mca/ess/base/ess_base_select.c b/orte/mca/ess/base/ess_base_select.c index 
5d03f59448..832f9d77c0 100644 --- a/orte/mca/ess/base/ess_base_select.c +++ b/orte/mca/ess/base/ess_base_select.c @@ -33,24 +33,6 @@ extern opal_list_t orte_ess_base_components_available; -/** - * Generic function to retrieve the epoch of a specific process - * from the job data. - */ -#if !ORTE_ENABLE_EPOCH -int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { - return 0; -} -#else -orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) { - orte_epoch_t epoch = ORTE_EPOCH_INVALID; - - epoch = orte_util_lookup_epoch(proc); - - return epoch; -} -#endif - int orte_ess_base_select(void) { diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 9063e99e30..cb134acb59 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -35,6 +37,7 @@ #include "opal/util/output.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_cr.h" +#include "opal/runtime/opal_progress.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/routed/base/base.h" @@ -48,6 +51,7 @@ #if OPAL_ENABLE_FT_CR == 1 #include "orte/mca/snapc/base/base.h" #endif +#include "orte/mca/state/base/base.h" #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" @@ -85,6 +89,18 @@ int orte_ess_base_app_setup(void) } } + /* open and setup the state machine */ + if (ORTE_SUCCESS != (ret = orte_state_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_state_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_select"; + goto error; + } + /* open the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { ORTE_ERROR_LOG(ret); @@ -248,11 +264,18 @@ int orte_ess_base_app_setup(void) * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { - if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) { + orte_grpcomm_collective_t coll; + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + coll.id = orte_process_info.peer_init_barrier; + if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { ORTE_ERROR_LOG(ret); error = "orte barrier"; goto error; } + while (coll.active) { + opal_progress(); /* block in progress pending events */ + } + OBJ_DESTRUCT(&coll); } return ORTE_SUCCESS; @@ -324,12 +347,18 @@ static void report_sync(int status, orte_process_name_t* sender, { /* flag as complete */ sync_recvd = true; + + /* (not really necessary, but good practice) */ + orte_proc_info_finalize(); + + /* Now Exit */ + exit(status); } void orte_ess_base_app_abort(int status, bool report) { orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED; - opal_buffer_t buf; + opal_buffer_t *buf; /* Exit - do 
NOT do a normal finalize as this will very likely * hang the process. We are aborting due to an abnormal condition @@ -345,10 +374,9 @@ void orte_ess_base_app_abort(int status, bool report) /* If we were asked to report this termination, do so */ if (report) { - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD); - orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buf, ORTE_RML_TAG_DAEMON, 0); - OBJ_DESTRUCT(&buf); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD); + orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, 0, orte_rml_send_callback, NULL); OPAL_OUTPUT_VERBOSE((5, orte_debug_output, "%s orte_ess_app_abort: sent abort msg to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -360,7 +388,7 @@ void orte_ess_base_app_abort(int status, bool report) sync_recvd = false; if (ORTE_SUCCESS == orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT, ORTE_RML_NON_PERSISTENT, report_sync, NULL)) { - ORTE_PROGRESSED_WAIT(sync_recvd, 0, 1); + return; } } diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 77e3dd104b..bed6f73d83 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -12,6 +12,8 @@ * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -62,6 +64,8 @@ #include "orte/mca/notifier/base/base.h" #include "orte/mca/sensor/base/base.h" #include "orte/mca/sensor/sensor.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_cr.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" @@ -84,6 +88,15 @@ static void shutdown_signal(int fd, short flags, void *arg); static void signal_callback(int fd, short flags, void *arg); static void epipe_signal_callback(int fd, short flags, void *arg); +static void setup_sighandler(int signal, opal_event_t *ev, + opal_event_cbfunc_t cbfunc) +{ + opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev); + opal_event_set_priority(ev, ORTE_ERROR_PRI); + opal_event_signal_add(ev, NULL); +} + + int orte_ess_base_orted_setup(char **hosts) { int ret = ORTE_ERROR; @@ -92,42 +105,32 @@ int orte_ess_base_orted_setup(char **hosts) char *jobidstring; char *error = NULL; char *plm_to_use; + orte_job_t *jdata; + orte_proc_t *proc; + orte_app_context_t *app; #ifndef __WINDOWS__ /* setup callback for SIGPIPE */ - opal_event_signal_set(opal_event_base, &epipe_handler, SIGPIPE, - epipe_signal_callback, &epipe_handler); - opal_event_signal_add(&epipe_handler, NULL); + setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. 
*/ - opal_event_set(opal_event_base, &term_handler, SIGTERM, OPAL_EV_SIGNAL, - shutdown_signal, NULL); - opal_event_add(&term_handler, NULL); - opal_event_set(opal_event_base, &int_handler, SIGINT, OPAL_EV_SIGNAL, - shutdown_signal, NULL); - opal_event_add(&int_handler, NULL); - - /** setup callbacks for signals we should ignore */ - opal_event_signal_set(opal_event_base, &sigusr1_handler, SIGUSR1, - signal_callback, &sigusr1_handler); - opal_event_signal_add(&sigusr1_handler, NULL); - opal_event_signal_set(opal_event_base, &sigusr2_handler, SIGUSR2, - signal_callback, &sigusr2_handler); - opal_event_signal_add(&sigusr2_handler, NULL); -#endif /* __WINDOWS__ */ - - signals_set = true; + setup_sighandler(SIGTERM, &term_handler, shutdown_signal); + setup_sighandler(SIGINT, &int_handler, shutdown_signal); - /* initialize the global list of local children and job data */ - OBJ_CONSTRUCT(&orte_local_children, opal_list_t); - OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); + /** setup callbacks for signals we should ignore */ + setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); + setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); + setup_sighandler(SIGTERM, &term_handler, shutdown_signal); +#endif /* __WINDOWS__ */ + + signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; - + /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { @@ -135,7 +138,7 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } } - + /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. 
So @@ -160,14 +163,14 @@ int orte_ess_base_orted_setup(char **hosts) break; } } - + if (4 < opal_output_get_verbosity(orte_ess_base_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif - + /* open and setup the opal_pstat framework so we can provide * process stats if requested */ @@ -178,7 +181,19 @@ int orte_ess_base_orted_setup(char **hosts) } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); - error = "orte_pstat_base_select"; + error = "opal_pstat_base_select"; + goto error; + } + + /* open and setup the state machine */ + if (ORTE_SUCCESS != (ret = orte_state_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_state_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_select"; goto error; } @@ -188,9 +203,9 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_errmgr_base_open"; goto error; } - + /* some environments allow remote launches - e.g., ssh - so - * open the PLM and select something -only- if we are given + * open and select something -only- if we are given * a specific module to use */ mca_base_param_reg_string_name("plm", NULL, @@ -215,7 +230,7 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } } - + /* Setup the communication infrastructure */ /* Runtime Messaging Layer - this opens/selects the OOB as well */ @@ -229,14 +244,14 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_rml_base_select"; goto error; } - + /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - + /* Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); @@ -248,7 +263,7 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_routed_base_select"; goto error; } - + /* * Group communications */ @@ 
-282,9 +297,6 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } - /* set the communication function */ - orte_comm = orte_global_comm; - /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); @@ -316,12 +328,8 @@ int orte_ess_base_orted_setup(char **hosts) * to mpirun goes through the tree if static ports were enabled - still * need to do it anyway just to initialize things */ - if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ret); - error = "failed to update routing tree"; - goto error; - } - + orte_routed.update_routing_plan(); + /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup @@ -354,17 +362,17 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } /* Once the session directory location has been established, set - the opal_output env file location to be in the - proc-specific session directory. */ + the opal_output env file location to be in the + proc-specific session directory. 
*/ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); - + /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ - + /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { @@ -372,7 +380,7 @@ int orte_ess_base_orted_setup(char **hosts) error = "convert_jobid"; goto error; } - + /* define a log file name in the session directory */ snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", jobidstring, orte_process_info.nodename); @@ -381,7 +389,7 @@ int orte_ess_base_orted_setup(char **hosts) orte_process_info.top_session_dir, log_file, NULL); - + fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so @@ -398,6 +406,44 @@ int orte_ess_base_orted_setup(char **hosts) } } + /* setup the global job array */ + orte_job_data = OBJ_NEW(opal_pointer_array_t); + if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, + 1, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1))) { + ORTE_ERROR_LOG(ret); + error = "setup job array"; + goto error; + } + + /* Setup the job data object for the daemons */ + /* create and store the job data object */ + jdata = OBJ_NEW(orte_job_t); + jdata->jobid = ORTE_PROC_MY_NAME->jobid; + opal_pointer_array_set_item(orte_job_data, 0, jdata); + + /* every job requires at least one app */ + app = OBJ_NEW(orte_app_context_t); + opal_pointer_array_set_item(jdata->apps, 0, app); + jdata->num_apps++; + + /* create and store a proc object for us */ + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = ORTE_PROC_MY_NAME->jobid; + proc->name.vpid = ORTE_PROC_MY_NAME->vpid; + + proc->pid = orte_process_info.pid; + proc->rml_uri = orte_rml.get_contact_info(); + proc->state = ORTE_PROC_STATE_RUNNING; + opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); + + /* record that the daemon job is running */ + 
jdata->num_procs = 1; + jdata->state = ORTE_JOB_STATE_RUNNING; + /* obviously, we have "reported" */ + jdata->num_reported = 1; + /* setup the routed info - the selected routed component * will know what to do. */ @@ -485,15 +531,15 @@ int orte_ess_base_orted_setup(char **hosts) } if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) { ORTE_ERROR_LOG(ret); - error = "ortesensor_select"; + error = "orte_sensor_select"; goto error; } /* start the local sensors */ orte_sensor.start(ORTE_PROC_MY_NAME->jobid); - + return ORTE_SUCCESS; - error: +error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); @@ -505,7 +551,7 @@ int orte_ess_base_orted_finalize(void) { /* stop the local sensors */ orte_sensor.stop(ORTE_PROC_MY_NAME->jobid); - + if (signals_set) { /* Release all local signal handlers */ opal_event_del(&epipe_handler); @@ -516,7 +562,7 @@ int orte_ess_base_orted_finalize(void) opal_event_signal_del(&sigusr2_handler); #endif /* __WINDOWS__ */ } - + /* cleanup */ if (NULL != log_path) { unlink(log_path); @@ -525,49 +571,9 @@ int orte_ess_base_orted_finalize(void) /* make sure our local procs are dead */ orte_odls.kill_local_procs(NULL); - /* whack any lingering session directory files from our jobs */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - orte_sensor_base_close(); - orte_notifier_base_close(); - - orte_cr_finalize(); - -#if OPAL_ENABLE_FT_CR == 1 - orte_snapc_base_close(); -#endif - orte_filem_base_close(); - - orte_odls_base_close(); - - orte_wait_finalize(); - orte_iof_base_close(); - - /* finalize selected modules */ - if (plm_in_use) { - orte_plm_base_close(); - } - - orte_errmgr_base_close(); - - /* now can close the rml and its friendly group comm */ - orte_grpcomm_base_close(); - orte_routed_base_close(); - orte_rml_base_close(); - /* cleanup any lingering session directories */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* handle the orted-specific OPAL stuff */ - 
opal_pstat_base_close(); -#if OPAL_HAVE_HWLOC - /* destroy the topology, if required */ - if (NULL != opal_hwloc_topology) { - opal_hwloc_base_free_topology(opal_hwloc_topology); - opal_hwloc_topology = NULL; - } -#endif - + return ORTE_SUCCESS; } @@ -578,7 +584,7 @@ static void shutdown_signal(int fd, short flags, void *arg) * check the one-time lock */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_quit(); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); } /** diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index 5c77d114ba..a00ed440a0 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/orte/mca/ess/cnos/ess_cnos_module.c b/orte/mca/ess/cnos/ess_cnos_module.c index 6f553efc15..dc2d815248 100644 --- a/orte/mca/ess/cnos/ess_cnos_module.c +++ b/orte/mca/ess/cnos/ess_cnos_module.c @@ -58,7 +58,6 @@ orte_ess_base_module_t orte_ess_cnos_module = { proc_get_hostname, proc_get_local_rank, proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* get_epoch */ NULL, /* add_pidmap is only used in ORTE */ NULL, /* update_nidmap is only used in ORTE */ NULL /* ft_event */ diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index 3c37ef29f7..26c7183713 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -92,7 +92,6 @@ orte_ess_base_module_t orte_ess_env_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, #if OPAL_ENABLE_FT_CR == 1 @@ 
-241,7 +240,6 @@ static int env_set_name(void) ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/ess/ess.h b/orte/mca/ess/ess.h index ce7156f775..915d97ee39 100644 --- a/orte/mca/ess/ess.h +++ b/orte/mca/ess/ess.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -59,8 +61,7 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void); * function should create an appropriate file to alert the local * orted that termination was abnormal. */ -typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report) - __opal_attribute_noreturn_funcptr__; +typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report); /** * Get the locality flag of the specified process @@ -105,19 +106,6 @@ typedef orte_local_rank_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_ */ typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc); -/** - * Update the epoch - * - * The epochs of the processes are stored in the process_name struct, but this - * will get the most up to date version stored within the orte_proc_t struct. - * Obviously the epoch of the proc that is passed in will be ignored. 
- */ -#if ORTE_ENABLE_EPOCH -typedef orte_epoch_t (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); -#else -typedef int (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc); -#endif - /** * Update the pidmap * @@ -159,7 +147,6 @@ struct orte_ess_base_module_1_0_0_t { orte_ess_base_module_proc_get_hostname_fn_t proc_get_hostname; orte_ess_base_module_proc_get_local_rank_fn_t get_local_rank; orte_ess_base_module_proc_get_node_rank_fn_t get_node_rank; - orte_ess_base_module_proc_get_epoch_fn_t proc_get_epoch; orte_ess_base_module_update_pidmap_fn_t update_pidmap; orte_ess_base_module_update_nidmap_fn_t update_nidmap; orte_ess_base_module_ft_event_fn_t ft_event; diff --git a/orte/mca/ess/generic/ess_generic_module.c b/orte/mca/ess/generic/ess_generic_module.c index cd9a0afa83..eb2c3a684a 100644 --- a/orte/mca/ess/generic/ess_generic_module.c +++ b/orte/mca/ess/generic/ess_generic_module.c @@ -86,7 +86,6 @@ orte_ess_base_module_t orte_ess_generic_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL @@ -145,7 +144,6 @@ static int rte_init(void) goto error; } ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10); - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "%s completed name definition", @@ -263,7 +261,6 @@ static int rte_init(void) if (vpid == ORTE_PROC_MY_NAME->vpid) { ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = i; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); } OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "%s node %d name %s rank %s", @@ -294,7 +291,6 @@ static int rte_init(void) if (vpid == ORTE_PROC_MY_NAME->vpid) { ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = i; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); } OPAL_OUTPUT_VERBOSE((1, 
orte_ess_base_output, "%s node %d name %s rank %d", diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 5e6b79c5c6..17c4bd9272 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,6 +65,8 @@ #include "orte/mca/snapc/base/base.h" #endif #include "orte/mca/filem/base/base.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/state.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" @@ -105,7 +109,6 @@ orte_ess_base_module_t orte_ess_hnp_module = { proc_get_hostname, proc_get_local_rank, proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ update_pidmap, update_nidmap, NULL /* ft_event */ @@ -113,6 +116,7 @@ orte_ess_base_module_t orte_ess_hnp_module = { /* local globals */ static bool signals_set=false; +static bool forcibly_die=false; static opal_event_t term_handler; static opal_event_t int_handler; static opal_event_t epipe_handler; @@ -124,10 +128,17 @@ static opal_event_t sigcont_handler; #endif /* __WINDOWS__ */ static void abort_signal_callback(int fd, short flags, void *arg); -static void abort_exit_callback(int fd, short event, void *arg); static void epipe_signal_callback(int fd, short flags, void *arg); static void signal_forward_callback(int fd, short event, void *arg); +static void setup_sighandler(int signal, opal_event_t *ev, + opal_event_cbfunc_t cbfunc) +{ + opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev); + opal_event_set_priority(ev, ORTE_ERROR_PRI); + opal_event_signal_add(ev, NULL); +} + static int rte_init(void) { int ret; @@ -138,47 +149,31 @@ static int rte_init(void) orte_proc_t *proc; 
orte_app_context_t *app; + /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } - + #ifndef __WINDOWS__ /* setup callback for SIGPIPE */ - opal_event_signal_set(opal_event_base, &epipe_handler, SIGPIPE, - epipe_signal_callback, &epipe_handler); - opal_event_signal_add(&epipe_handler, NULL); + setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /** setup callbacks for abort signals - from this point * forward, we need to abort in a manner that allows us * to cleanup */ - opal_event_signal_set(opal_event_base, &term_handler, SIGTERM, - abort_signal_callback, &term_handler); - opal_event_signal_add(&term_handler, NULL); - opal_event_signal_set(opal_event_base, &int_handler, SIGINT, - abort_signal_callback, &int_handler); - opal_event_signal_add(&int_handler, NULL); + setup_sighandler(SIGTERM, &term_handler, abort_signal_callback); + setup_sighandler(SIGINT, &int_handler, abort_signal_callback); /** setup callbacks for signals we should foward */ - opal_event_signal_set(opal_event_base, &sigusr1_handler, SIGUSR1, - signal_forward_callback, &sigusr1_handler); - opal_event_signal_add(&sigusr1_handler, NULL); - opal_event_signal_set(opal_event_base, &sigusr2_handler, SIGUSR2, - signal_forward_callback, &sigusr2_handler); - opal_event_signal_add(&sigusr2_handler, NULL); - if (orte_forward_job_control) { - opal_event_signal_set(opal_event_base, &sigtstp_handler, SIGTSTP, - signal_forward_callback, &sigtstp_handler); - opal_event_signal_add(&sigtstp_handler, NULL); - opal_event_signal_set(opal_event_base, &sigcont_handler, SIGCONT, - signal_forward_callback, &sigcont_handler); - opal_event_signal_add(&sigcont_handler, NULL); - } + setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback); + setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback); + setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback); + setup_sighandler(SIGCONT, &sigcont_handler, 
signal_forward_callback); #endif /* __WINDOWS__ */ - signals_set = true; - + #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; @@ -240,10 +235,22 @@ static int rte_init(void) } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); - error = "orte_pstat_base_select"; + error = "opal_pstat_base_select"; goto error; } - + + /* open and setup the state machine */ + if (ORTE_SUCCESS != (ret = orte_state_base_open())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_state_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_state_base_select"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { error = "orte_errmgr_base_open"; goto error; @@ -270,7 +277,7 @@ static int rte_init(void) error = "orte_plm_set_hnp_name"; goto error; } - + /* Setup the communication infrastructure */ /* @@ -381,9 +388,6 @@ static int rte_init(void) goto error; } - /* set the communication function */ - orte_comm = orte_global_comm; - /* we are an hnp, so update the contact info field for later use */ orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); @@ -393,7 +397,7 @@ static int rte_init(void) #if !ORTE_DISABLE_FULL_SUPPORT /* setup the orte_show_help system to recv remote output */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, - ORTE_RML_NON_PERSISTENT, orte_show_help_recv, NULL); + ORTE_RML_PERSISTENT, orte_show_help_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); error = "setup receive for orte_show_help"; @@ -476,6 +480,7 @@ static int rte_init(void) error = "setup node topologies array"; goto error; } + /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); @@ -502,7 +507,6 @@ static int rte_init(void) proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; - 
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); @@ -526,7 +530,9 @@ static int rte_init(void) /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; - + /* obviously, we have "reported" */ + jdata->num_reported = 1; + /* setup the routed info - the selected routed component * will know what to do. */ @@ -649,7 +655,7 @@ static int rte_init(void) problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); - + return ORTE_SUCCESS; error: @@ -665,9 +671,6 @@ error: static int rte_finalize(void) { char *contact_path; - orte_node_t *node; - orte_job_t *job; - int i; if (signals_set) { /* Remove the epipe handler */ @@ -696,69 +699,12 @@ static int rte_finalize(void) unlink(contact_path); free(contact_path); - orte_sensor_base_close(); - orte_notifier_base_close(); - - orte_cr_finalize(); - -#if OPAL_ENABLE_FT_CR == 1 - orte_snapc_base_close(); -#endif - orte_filem_base_close(); - - orte_odls_base_close(); - - orte_wait_finalize(); + /* output any lingering stdout/err data */ orte_iof_base_close(); - - /* finalize selected modules so they can de-register - * any receives - */ - orte_ras_base_close(); - orte_rmaps_base_close(); - orte_plm_base_close(); - orte_errmgr_base_close(); - orte_grpcomm_base_close(); - - /* now can close the rml */ - orte_routed_base_close(); - orte_rml_base_close(); - - /* if we were doing timing studies, close the timing file */ - if (orte_timing) { - if (stdout != orte_timing_output && - stderr != orte_timing_output) { - fclose(orte_timing_output); - } - } - - /* cleanup the job and node info arrays */ - if (NULL != orte_node_pool) { - for (i=0; i < orte_node_pool->size; i++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,i))) { - OBJ_RELEASE(node); - } - } - OBJ_RELEASE(orte_node_pool); - } - if 
(NULL != orte_job_data) { - for (i=0; i < orte_job_data->size; i++) { - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data,i))) { - OBJ_RELEASE(job); - } - } - OBJ_RELEASE(orte_job_data); - } /* finalize the session directory tree */ orte_session_dir_finalize(ORTE_PROC_MY_NAME); - /* clean out the global structures */ - orte_proc_info_finalize(); - if (NULL != orte_job_ident) { - free(orte_job_ident); - } - /* close the xml output file, if open */ if (orte_xml_output) { fprintf(orte_xml_fp, "\n"); @@ -767,16 +713,6 @@ static int rte_finalize(void) fclose(orte_xml_fp); } } - - /* handle the orted-specific OPAL stuff */ - opal_pstat_base_close(); -#if OPAL_HAVE_HWLOC - /* destroy the topology, if required */ - if (NULL != opal_hwloc_topology) { - opal_hwloc_base_free_topology(opal_hwloc_topology); - opal_hwloc_topology = NULL; - } -#endif return ORTE_SUCCESS; } @@ -962,51 +898,6 @@ static int update_nidmap(opal_byte_object_t *bo) return ORTE_SUCCESS; } -static bool forcibly_die=false; - -static void abort_exit_callback(int fd, short ign, void *arg) -{ - int ret; - - fprintf(stderr, "%s: killing job...\n\n", orte_basename); - - /* since we are being terminated by a user's signal, be - * sure to exit with a non-zero exit code - but don't - * overwrite any error code from a proc that might have - * failed, in case that is why the user ordered us - * to terminate - */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - - /* terminate the job - this will also wakeup orterun so - * it can report to the user and kill all the orteds. - * Check the jobid, though, just in case the user - * hit ctrl-c before we had a chance to setup the - * job in the system - in which case there is nothing - * to terminate! - */ - if (!orte_never_launched) { - /* - * Turn off the process recovery functionality, if it was enabled. - * This keeps the errmgr from trying to recover from the shutdown - * procedure. 
- */ - orte_enable_recovery = false; - - /* terminate the orteds - they will automatically kill - * their local procs - */ - ret = orte_plm.terminate_orteds(); - - } else { - /* if the jobid is invalid or we never launched, - * there is nothing to do but just clean ourselves - * up and exit - */ - orte_quit(); - } -} - /* * Attempt to terminate the job and wait for callback indicating * the job has been aborted. @@ -1047,12 +938,17 @@ static void abort_signal_callback(int fd, short flags, void *arg) */ orte_execute_quiet = true; + if (!orte_never_launched) { + /* cleanup our data server */ + orte_data_server_finalize(); + } + /* We are in an event handler; the job completed procedure will delete the signal handler that is currently running (which is a Bad Thing), so we can't call it directly. Instead, we have to exit this handler and setup to call job_completed() after this. */ - ORTE_TIMER_EVENT(0, 0, abort_exit_callback); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } /** @@ -1066,7 +962,7 @@ static void epipe_signal_callback(int fd, short flags, void *arg) if (10 < sigpipe_error_count) { /* time to abort */ opal_output(0, "%s: SIGPIPE detected on fd %d - aborting", orte_basename, fd); - abort_exit_callback(0, 0, 0); + abort_signal_callback(0, 0, NULL); } return; diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index f7f7a9a52b..7d96f125b4 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -60,7 +60,6 @@ orte_ess_base_module_t orte_ess_lsf_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -215,8 +214,6 @@ static int lsf_set_name(void) lsf_nodeid = atoi(getenv("LSF_PM_TASKID")); ORTE_PROC_MY_NAME->vpid = vpid + lsf_nodeid; - 
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); - /* get the non-name common environmental variables */ if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/ess/pmi/ess_pmi_component.c b/orte/mca/ess/pmi/ess_pmi_component.c index 2c66fef654..0ae93953d7 100644 --- a/orte/mca/ess/pmi/ess_pmi_component.c +++ b/orte/mca/ess/pmi/ess_pmi_component.c @@ -106,7 +106,7 @@ static int pmi_component_query(mca_base_module_t **module, int *priority) /* we are available anywhere PMI is available, but not for HNP itself */ if (!ORTE_PROC_IS_HNP && pmi_startup()) { /* if PMI is available, use it */ - *priority = 40; + *priority = 35; *module = (mca_base_module_t *)&orte_ess_pmi_module; return ORTE_SUCCESS; } diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 3934b04cec..8e5dea8f00 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -63,7 +63,7 @@ static int rte_init(void); static int rte_finalize(void); -static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__; +static void rte_abort(int error_code, bool report); orte_ess_base_module_t orte_ess_pmi_module = { rte_init, @@ -74,7 +74,6 @@ orte_ess_base_module_t orte_ess_pmi_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -223,13 +222,10 @@ static int rte_init(void) free(cs_env); free(string_key); - /* get our app_context number */ - if (PMI_SUCCESS != (ret = PMI_Get_appnum(&i))) { - ORTE_PMI_ERROR(ret, "PMI_Get_appnum"); - error = "could not get PMI appnum"; - goto error; - } - orte_process_info.app_num = i; + /* our app_context number can only be 0 as we don't support + * dynamic spawns + */ + orte_process_info.app_num = 0; /* setup the nidmap arrays - they will be 
filled by the modex */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { @@ -307,9 +303,6 @@ static int rte_init(void) } } - /* complete definition of process name */ - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); - /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; diff --git a/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c b/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c index 4245fe1a02..6dd52360ce 100644 --- a/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c +++ b/orte/mca/ess/portals4_shmem/ess_portals4_shmem_module.c @@ -56,7 +56,6 @@ orte_ess_base_module_t orte_ess_portals4_shmem_module = { proc_get_hostname, proc_get_local_rank, proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ NULL, /* add_pidmap is only used in ORTE */ NULL, /* update_nidmap is only used in ORTE */ NULL /* ft_event */ diff --git a/orte/mca/ess/singleton/ess_singleton_module.c b/orte/mca/ess/singleton/ess_singleton_module.c index 437036d26a..605bded8b1 100644 --- a/orte/mca/ess/singleton/ess_singleton_module.c +++ b/orte/mca/ess/singleton/ess_singleton_module.c @@ -79,7 +79,6 @@ orte_ess_base_module_t orte_ess_singleton_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -178,7 +177,6 @@ static int rte_init(void) /* set the name */ ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); ORTE_PROC_MY_NAME->vpid = 0; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); } else { /* diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 587d0cf713..74c242a71c 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -63,7 +63,6 @@ 
orte_ess_base_module_t orte_ess_slurm_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -193,7 +192,6 @@ static int slurm_set_name(void) /* fix up the vpid and make it the "real" vpid */ slurm_nodeid = atoi(getenv("SLURM_NODEID")); ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/ess/slurmd/ess_slurmd_module.c b/orte/mca/ess/slurmd/ess_slurmd_module.c index 84ed39175e..90f59e0dba 100644 --- a/orte/mca/ess/slurmd/ess_slurmd_module.c +++ b/orte/mca/ess/slurmd/ess_slurmd_module.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -59,7 +61,7 @@ static int rte_init(void); static int rte_finalize(void); -static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__; +static void rte_abort(int error_code, bool report); orte_ess_base_module_t orte_ess_slurmd_module = { rte_init, @@ -70,7 +72,6 @@ orte_ess_base_module_t orte_ess_slurmd_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -185,7 +186,6 @@ static int rte_init(void) nodeid = strtol(envar, NULL, 10); ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = nodeid; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch); /* get the node list */ if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) { @@ -370,9 +370,6 @@ static int rte_init(void) putenv("OMPI_MCA_grpcomm=hier"); putenv("OMPI_MCA_routed=direct"); - /* complete definition of process name */ - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); - /* get our local rank */ if (NULL == (envar = getenv("SLURM_LOCALID"))) { error = "could not get SLURM_LOCALID"; diff --git a/orte/mca/ess/tm/ess_tm_module.c b/orte/mca/ess/tm/ess_tm_module.c index 6d149f6962..21d14a81b8 100644 --- a/orte/mca/ess/tm/ess_tm_module.c +++ b/orte/mca/ess/tm/ess_tm_module.c @@ -62,7 +62,6 @@ orte_ess_base_module_t orte_ess_tm_module = { orte_ess_base_proc_get_hostname, orte_ess_base_proc_get_local_rank, orte_ess_base_proc_get_node_rank, - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ orte_ess_base_update_pidmap, orte_ess_base_update_nidmap, NULL /* ft_event */ @@ -217,7 +216,6 @@ static int tm_set_name(void) ORTE_PROC_MY_NAME->jobid = jobid; ORTE_PROC_MY_NAME->vpid = vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME)); OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:tm set 
name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/ess/tool/ess_tool_module.c b/orte/mca/ess/tool/ess_tool_module.c index 3884990e7f..f9ec730521 100644 --- a/orte/mca/ess/tool/ess_tool_module.c +++ b/orte/mca/ess/tool/ess_tool_module.c @@ -55,7 +55,6 @@ orte_ess_base_module_t orte_ess_tool_module = { NULL, /* don't need a proc_get_hostname fn */ NULL, /* don't need a proc_get_local_rank fn */ NULL, /* don't need a proc_get_node_rank fn */ - orte_ess_base_proc_get_epoch, /* proc_get_epoch */ NULL, /* don't need to update_pidmap */ NULL, /* don't need to update_nidmap */ NULL /* ft_event */ diff --git a/orte/mca/filem/base/filem_base_receive.c b/orte/mca/filem/base/filem_base_receive.c index 93e4b126b3..658ce07095 100644 --- a/orte/mca/filem/base/filem_base_receive.c +++ b/orte/mca/filem/base/filem_base_receive.c @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -49,6 +51,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" +#include "orte/mca/state/state.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_quit.h" @@ -184,6 +187,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &name, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } @@ -193,16 +197,14 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende /* get the job data object for this proc */ if (NULL == (jdata = orte_get_job_data_object(name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } /* get the proc object for it */ procs = (orte_proc_t**)jdata->procs->addr; if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } @@ -211,13 +213,13 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } CLEANUP: @@ -251,6 +253,7 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender, count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &filename, &count, OPAL_STRING))) { ORTE_ERROR_LOG(rc); + 
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } @@ -297,19 +300,18 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender, */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(1); - orte_jobs_complete(); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto CLEANUP; } if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } CLEANUP: diff --git a/orte/mca/filem/rsh/filem_rsh_module.c b/orte/mca/filem/rsh/filem_rsh_module.c index c801e4b7b5..fa7fcadc7a 100644 --- a/orte/mca/filem/rsh/filem_rsh_module.c +++ b/orte/mca/filem/rsh/filem_rsh_module.c @@ -1096,11 +1096,9 @@ static int orte_filem_rsh_start_command(orte_filem_base_process_set_t *proc_set if( NULL != proc_set ) { wp_item->proc_set.source.jobid = proc_set->source.jobid; wp_item->proc_set.source.vpid = proc_set->source.vpid; - ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,proc_set->source.epoch); wp_item->proc_set.sink.jobid = proc_set->sink.jobid; wp_item->proc_set.sink.vpid = proc_set->sink.vpid; - ORTE_EPOCH_SET(wp_item->proc_set.sink.epoch,proc_set->sink.epoch); } /* Copy the File Set */ if( NULL != file_set ) { @@ -1395,7 +1393,6 @@ static void orte_filem_rsh_permission_callback(int status, wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t); wp_item->proc_set.source.jobid = sender->jobid; wp_item->proc_set.source.vpid = sender->vpid; - ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,sender->epoch); opal_list_append(&work_pool_waiting, &(wp_item->super)); } diff --git a/orte/mca/grpcomm/bad/grpcomm_bad_module.c b/orte/mca/grpcomm/bad/grpcomm_bad_module.c index c6661539b5..9d3ea66c62 
100644 --- a/orte/mca/grpcomm/bad/grpcomm_bad_module.c +++ b/orte/mca/grpcomm/bad/grpcomm_bad_module.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,9 +52,8 @@ static void finalize(void); static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int bad_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); -static int bad_barrier(void); -static int modex(opal_list_t *procs); +static int bad_allgather(orte_grpcomm_collective_t *coll); +static int bad_barrier(orte_grpcomm_collective_t *coll); /* Module def */ orte_grpcomm_base_module_t orte_grpcomm_bad_module = { @@ -60,17 +61,13 @@ orte_grpcomm_base_module_t orte_grpcomm_bad_module = { finalize, xcast, bad_allgather, - orte_grpcomm_base_allgather_list, bad_barrier, orte_grpcomm_base_set_proc_attr, orte_grpcomm_base_get_proc_attr, - modex, + orte_grpcomm_base_modex, orte_grpcomm_base_purge_proc_attrs }; -/* Local variables */ -static orte_grpcomm_collective_t barrier, allgather; - /** * Initialize the module */ @@ -83,21 +80,9 @@ static int init(void) return rc; } - /* setup global variables */ - OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t); - OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t); - - /* if we are a daemon or the hnp, we need to post a - * recv to catch any collective operations - */ - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_daemon_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } + /* setup recvs */ + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_comm_start())) { + ORTE_ERROR_LOG(rc); } return rc; @@ -110,16 +95,8 @@ static void 
finalize(void) { orte_grpcomm_base_modex_finalize(); - /* destruct the globals */ - OBJ_DESTRUCT(&barrier); - OBJ_DESTRUCT(&allgather); - - /* if we are a daemon or the hnp, we need to cancel the - * recv we posted - */ - if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON_COLLECTIVE); - } + /* cancel recv */ + orte_grpcomm_base_comm_stop(); } /** @@ -133,7 +110,7 @@ static int xcast(orte_jobid_t job, orte_rml_tag_t tag) { int rc = ORTE_SUCCESS; - opal_buffer_t buf; + opal_buffer_t *buf; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:bad:xcast sent to job %s tag %ld", @@ -146,204 +123,129 @@ static int xcast(orte_jobid_t job, } /* prep the output buffer */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); + buf = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_pack_xcast(ORTE_DAEMON_PROCESS_AND_RELAY_CMD, - job, &buf, buffer, tag))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_xcast(job, buf, buffer, tag))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } - /* if I am the HNP, just set things up so the cmd processor gets called. - * We don't want to message ourselves as this can create circular logic - * in the RML. 
Instead, this macro will set a zero-time event which will - * cause the buffer to be processed by the cmd processor - probably will - * fire right away, but that's okay - * The macro makes a copy of the buffer, so it's okay to release it here - */ - if (ORTE_PROC_IS_HNP) { - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - } else { - /* otherwise, send it to the HNP for relay */ - if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_DAEMON, 0))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - rc = ORTE_SUCCESS; + /* send it to the HNP (could be myself) for relay */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_XCAST, + 0, orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + goto CLEANUP; } + rc = ORTE_SUCCESS; CLEANUP: - OBJ_DESTRUCT(&buf); return rc; } -static void barrier_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata; - - OPAL_THREAD_LOCK(&coll->lock); - /* flag as recvd */ - coll->recvd = 1; - opal_condition_broadcast(&coll->cond); - OPAL_THREAD_UNLOCK(&coll->lock); -} - -static int bad_barrier(void) +static int bad_barrier(orte_grpcomm_collective_t *coll) { int rc; - + opal_buffer_t *buf; + orte_namelist_t *nm; + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:bad entering barrier", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* if I am alone, just return */ + /* if I am alone, just execute the callback */ if (1 == orte_process_info.num_procs) { + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } - /* setup the recv to get the response */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BARRIER, - ORTE_RML_NON_PERSISTENT, barrier_recv, &barrier); - if (rc != ORTE_SUCCESS) { + /* mark the collective as active */ + 
coll->active = true; + + /* setup the collective */ + opal_list_append(&orte_grpcomm_base.active_colls, &coll->super); + + if (0 == opal_list_get_size(&coll->participants)) { + /* add a wildcard name to the participants so the daemon knows + * that everyone in my job must participate + */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->participants, &nm->super); + } + + /* pack the collective - no data should be involved, but we need + * to ensure we get the header info correct so it can be + * unpacked without error + */ + buf = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(buf, coll, ORTE_GRPCOMM_INTERNAL_STG_APP); + + /* send the buffer to my daemon */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_COLLECTIVE, + 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + opal_list_remove_item(&orte_grpcomm_base.active_colls, &coll->super); return rc; } - - /* send it and wait for the response */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_barrier(ORTE_PROC_MY_DAEMON, &barrier))) { - ORTE_ERROR_LOG(rc); - } - - /* don't need to cancel the recv as it only fires once */ OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:bad received barrier release", + "%s grpcomm:bad barrier underway", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return rc; } -static void allgather_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) +static int bad_allgather(orte_grpcomm_collective_t *gather) { - orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata; int rc; - - OPAL_THREAD_LOCK(&coll->lock); - /* xfer the data */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll->results, buffer))) { - ORTE_ERROR_LOG(rc); - } - /* the daemon returns ALL of our recipients in a single message */ - coll->recvd = orte_process_info.num_procs; - 
opal_condition_broadcast(&coll->cond); - OPAL_THREAD_UNLOCK(&coll->lock); -} + opal_buffer_t *buf; -static int bad_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - int rc; - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:bad entering allgather", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup to receive results */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER, - ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - return rc; + /* if I am alone, just fire callback */ + if (1 == orte_process_info.num_procs) { + gather->active = false; + if (NULL != gather->cbfunc) { + gather->cbfunc(&gather->buffer, gather->cbdata); + } + return ORTE_SUCCESS; } - /* everyone sends data to their local daemon */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_allgather(ORTE_PROC_MY_DAEMON, - &allgather, sbuf, rbuf))) { + /* mark the collective as active */ + gather->active = true; + + /* if this is an original request, then record the collective */ + if (NULL == gather->next_cb) { + opal_list_append(&orte_grpcomm_base.active_colls, &gather->super); + } + + /* start the allgather op by sending the data to our daemon - the + * user will have put the data in the "buffer" field + */ + buf = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(buf, gather, ORTE_GRPCOMM_INTERNAL_STG_APP); + /* send to our daemon */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, + ORTE_RML_TAG_COLLECTIVE, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + opal_list_remove_item(&orte_grpcomm_base.active_colls, &gather->super); return rc; } - /* don't need to cancel the recv as it only fires once */ - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad allgather completed", + "%s grpcomm:bad allgather underway", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; } - -/*** MODEX SECTION ***/ -static int modex(opal_list_t *procs) 
-{ - int rc; - opal_buffer_t buf, rbuf; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad: modex entered", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (NULL == procs) { - /* This is a modex across our peers at startup. The modex will be realized in the - * background by the daemons. The processes will - * only be informed when all data has been collected from all processes. The get_attr - * will realize the blocking, it will not return until the data has been received. - */ - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad:peer:modex: performing modex", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* setup the buffers */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - OBJ_CONSTRUCT(&rbuf, opal_buffer_t); - - /* put our process name in the buffer so it can be unpacked later */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* pack the entries we have received */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* perform the allgather */ - if (ORTE_SUCCESS != (rc = bad_allgather(&buf, &rbuf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* store the results */ - if( ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_unpack(&rbuf)) ) { - ORTE_ERROR_LOG(rc); - } - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad: modex posted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - cleanup: - OBJ_DESTRUCT(&buf); - OBJ_DESTRUCT(&rbuf); - - return rc; - } else { - /* this is a modex across a specified list of procs, usually during - * a connect/accept. 
- */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs))) { - ORTE_ERROR_LOG(rc); - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:bad: modex completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return rc; -} diff --git a/orte/mca/grpcomm/base/Makefile.am b/orte/mca/grpcomm/base/Makefile.am index 34644845fb..185558af83 100644 --- a/orte/mca/grpcomm/base/Makefile.am +++ b/orte/mca/grpcomm/base/Makefile.am @@ -9,6 +9,8 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -27,9 +29,7 @@ libmca_grpcomm_la_SOURCES += \ if !ORTE_DISABLE_FULL_SUPPORT libmca_grpcomm_la_SOURCES += \ - base/grpcomm_base_allgather.c \ base/grpcomm_base_modex.c \ - base/grpcomm_base_coll.c \ - base/grpcomm_base_app_fns.c - + base/grpcomm_base_receive.c \ + base/grpcomm_base_xcast.c endif diff --git a/orte/mca/grpcomm/base/base.h b/orte/mca/grpcomm/base/base.h index d4d2daf3b4..eba414b639 100644 --- a/orte/mca/grpcomm/base/base.h +++ b/orte/mca/grpcomm/base/base.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -49,9 +51,6 @@ ORTE_DECLSPEC int orte_grpcomm_base_open(void); ORTE_DECLSPEC int orte_grpcomm_base_select(void); ORTE_DECLSPEC int orte_grpcomm_base_close(void); -/* daemon collective function */ -typedef void (*orte_grpcomm_daemon_collective_fn_t)(orte_process_name_t *sender, - opal_buffer_t *data); /* * globals that might be needed */ @@ -60,7 +59,8 @@ typedef struct { bool selected; opal_list_t components_available; orte_grpcomm_base_component_t selected_component; - orte_grpcomm_daemon_collective_fn_t daemon_coll; + orte_grpcomm_coll_id_t coll_id; + opal_list_t active_colls; #if OPAL_HAVE_HWLOC hwloc_cpuset_t working_cpuset; #endif @@ -68,30 +68,23 @@ typedef struct { ORTE_DECLSPEC extern orte_grpcomm_base_t orte_grpcomm_base; -/* structure for tracking collective operations */ -typedef struct { - opal_object_t super; - opal_mutex_t lock; - opal_condition_t cond; - orte_vpid_t recvd; - opal_buffer_t results; -} orte_grpcomm_collective_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_grpcomm_collective_t); +ORTE_DECLSPEC orte_grpcomm_collective_t* orte_grpcomm_base_setup_collective(orte_grpcomm_coll_id_t id); +ORTE_DECLSPEC void orte_grpcomm_base_progress_collectives(void); +ORTE_DECLSPEC orte_grpcomm_coll_id_t orte_grpcomm_base_get_coll_id(void); +ORTE_DECLSPEC void orte_grpcomm_base_pack_collective(opal_buffer_t *relay, + orte_grpcomm_collective_t *coll, + orte_grpcomm_internal_stage_t stg); -/* - * Base functions - */ -ORTE_DECLSPEC int orte_grpcomm_base_allgather_list(opal_list_t *names, - opal_buffer_t *sbuf, - opal_buffer_t *rbuf); +/* modex support */ ORTE_DECLSPEC int orte_grpcomm_base_set_proc_attr(const char *attr_name, const void *data, size_t size); ORTE_DECLSPEC int orte_grpcomm_base_get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, size_t *size); -ORTE_DECLSPEC int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf); -ORTE_DECLSPEC int 
orte_grpcomm_base_full_modex(opal_list_t *procs); +ORTE_DECLSPEC void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata); +ORTE_DECLSPEC void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata); +ORTE_DECLSPEC int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex); ORTE_DECLSPEC int orte_grpcomm_base_purge_proc_attrs(void); ORTE_DECLSPEC int orte_grpcomm_base_modex_init(void); ORTE_DECLSPEC void orte_grpcomm_base_modex_finalize(void); @@ -101,30 +94,16 @@ ORTE_DECLSPEC int orte_grpcomm_base_update_modex_entries(orte_process_name_t * ORTE_DECLSPEC int orte_grpcomm_base_load_modex_data(orte_process_name_t *proc, char *attribute_name, void *data, int num_bytes); -/* app functions */ -ORTE_DECLSPEC int orte_grpcomm_base_app_barrier(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll); -ORTE_DECLSPEC int orte_grpcomm_base_app_allgather(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll, - opal_buffer_t *sbuf, - opal_buffer_t *rbuf); -ORTE_DECLSPEC int orte_grpcomm_base_app_pack_xcast(orte_daemon_cmd_flag_t cmd, - orte_jobid_t job, - opal_buffer_t *buffer, - opal_buffer_t *message, - orte_rml_tag_t tag); - -/* Tuned collectives */ -ORTE_DECLSPEC void orte_grpcomm_base_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata); -ORTE_DECLSPEC int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids); -ORTE_DECLSPEC void orte_grpcomm_base_daemon_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata); -ORTE_DECLSPEC void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender, - opal_buffer_t *data); +/* comm support */ +ORTE_DECLSPEC int orte_grpcomm_base_comm_start(void); +ORTE_DECLSPEC void orte_grpcomm_base_comm_stop(void); +ORTE_DECLSPEC void orte_grpcomm_base_xcast_recv(int status, 
orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +ORTE_DECLSPEC int orte_grpcomm_base_pack_xcast(orte_jobid_t job, + opal_buffer_t *buffer, + opal_buffer_t *message, + orte_rml_tag_t tag); END_C_DECLS #endif diff --git a/orte/mca/grpcomm/base/grpcomm_base_allgather.c b/orte/mca/grpcomm/base/grpcomm_base_allgather.c deleted file mode 100644 index 9477eadd9a..0000000000 --- a/orte/mca/grpcomm/base/grpcomm_base_allgather.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/util/output.h" - -#include "opal/dss/dss.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "orte/orted/orted.h" -#include "orte/runtime/orte_wait.h" - -#include "orte/mca/grpcomm/base/base.h" - -static bool allgather_failed; -static orte_std_cntr_t allgather_num_recvd; -static opal_buffer_t *allgather_buf; - -static void allgather_server_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather buffer received from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* append this data to the allgather_buf */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(allgather_buf, buffer))) { - ORTE_ERROR_LOG(rc); - allgather_failed = true; - return; - } - - /* bump the counter */ - ++allgather_num_recvd; - - /* reissue the recv */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST, - ORTE_RML_NON_PERSISTENT, allgather_server_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - allgather_failed = true; - } -} - -static void allgather_client_recv(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:base: allgather buffer received", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* transfer the buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(allgather_buf, buffer))) { - ORTE_ERROR_LOG(rc); - allgather_failed = true; - } - - 
/* bump the counter */ - ++allgather_num_recvd; -} - -static orte_std_cntr_t allgather_num_sent; -static void allgather_send_cb(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - /* increment the count */ - ++allgather_num_sent; -} - - -int orte_grpcomm_base_allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - opal_list_item_t *item; - orte_namelist_t *peer, *root; - int32_t num_peers; - int rc; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm: entering allgather_list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* the first entry on the list is the "root" that collects - * all the data - everyone else just sends and gets back - * the results - */ - root = (orte_namelist_t*)opal_list_get_first(names); - - /*** NON-ROOT ***/ - if (OPAL_EQUAL != opal_dss.compare(&root->name, ORTE_PROC_MY_NAME, ORTE_NAME)) { - /* everyone but root sends data */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: sending my data to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&root->name))); - - if (0 > orte_rml.send_buffer(&root->name, sbuf, ORTE_RML_TAG_ALLGATHER_LIST, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - return ORTE_ERR_COMM_FAILURE; - } - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: buffer sent", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* setup the buffer that will recv the results */ - allgather_buf = OBJ_NEW(opal_buffer_t); - - /* now receive the final result from rank=0. Be sure to do this in - * a manner that allows us to return without being in a recv! 
- */ - allgather_num_recvd = 0; - allgather_failed = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST, - ORTE_RML_NON_PERSISTENT, allgather_client_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, 1); - - /* cancel the lingering recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST); - - /* if the allgather failed, return an error */ - if (allgather_failed) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - OBJ_RELEASE(allgather_buf); - return ORTE_ERR_COMM_FAILURE; - } - - /* copy payload to the caller's buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(allgather_buf); - return rc; - } - OBJ_RELEASE(allgather_buf); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: buffer received", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; - } - - - /*** ROOT ***/ - /* count how many peers are participating, including myself */ - num_peers = (int32_t)opal_list_get_size(names); - - /* seed the outgoing buffer with the num_procs so it can be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &num_peers, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* put my own information into the outgoing buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, sbuf))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* setup the recv conditions */ - allgather_failed = false; - allgather_num_recvd = 0; - - /* setup the buffer that will recv the results */ - allgather_buf = OBJ_NEW(opal_buffer_t); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: waiting to recv %ld inputs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)num_peers-1)); - - /* post the non-blocking recv */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, 
ORTE_RML_TAG_ALLGATHER_LIST, - ORTE_RML_NON_PERSISTENT, allgather_server_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, num_peers-1); - - /* cancel the lingering recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s allgather_list: received all data", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* copy the received info to the caller's buffer */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(allgather_buf); - return rc; - } - OBJ_RELEASE(allgather_buf); - - /* broadcast the results */ - allgather_num_sent = 0; - for (item = opal_list_get_first(names); - item != opal_list_get_end(names); - item = opal_list_get_next(item)) { - peer = (orte_namelist_t*)item; - - /* skip myself */ - if (OPAL_EQUAL == opal_dss.compare(&root->name, &peer->name, ORTE_NAME)) { - continue; - } - - /* transmit the buffer to this process */ - if (0 > orte_rml.send_buffer_nb(&peer->name, rbuf, ORTE_RML_TAG_ALLGATHER_LIST, - 0, allgather_send_cb, 0)) { - ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); - return ORTE_ERR_COMM_FAILURE; - } - } - - ORTE_PROGRESSED_WAIT(false, allgather_num_sent, num_peers-1); - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm: allgather_list completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; -} diff --git a/orte/mca/grpcomm/base/grpcomm_base_app_fns.c b/orte/mca/grpcomm/base/grpcomm_base_app_fns.c deleted file mode 100644 index e3b8d2a070..0000000000 --- a/orte/mca/grpcomm/base/grpcomm_base_app_fns.c +++ /dev/null @@ -1,220 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. 
- * Copyright (c) 2004-2009 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/util/output.h" -#include "opal/class/opal_hash_table.h" -#include "opal/dss/dss.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" - -#include "orte/util/proc_info.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/util/name_fns.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/odls/odls_types.h" - -#include "orte/mca/grpcomm/base/base.h" - -int orte_grpcomm_base_app_pack_xcast(orte_daemon_cmd_flag_t cmd, - orte_jobid_t job, - opal_buffer_t *buffer, - opal_buffer_t *message, - orte_rml_tag_t tag) -{ - orte_daemon_cmd_flag_t command; - int rc; - - /* pack the base cmd for the daemon/HNP */ - command = cmd; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - /* pack the target jobid and tag for use in relay */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* if this isn't intended for the daemon command tag, then we better - * tell the daemon to deliver it to the procs, and what job is supposed - * to get it - this occurs when a caller just 
wants to send something - * to all the procs in a job. In that use-case, the caller doesn't know - * anything about inserting daemon commands or what routing algo might - * be used, so we have to help them out a little. Functions that are - * sending commands to the daemons themselves are smart enough to know - * what they need to do. - */ - if (ORTE_RML_TAG_DAEMON != tag) { - command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - } - - /* copy the payload into the new buffer - this is non-destructive, so our - * caller is still responsible for releasing any memory in the buffer they - * gave to us - */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(buffer, message))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - -CLEANUP: - return ORTE_SUCCESS; -} - -int orte_grpcomm_base_app_barrier(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll) -{ - int rc; - opal_buffer_t buf; - orte_rml_tag_t tag=ORTE_RML_TAG_BARRIER; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:app entering barrier", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* add the barrier tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - - /* send the buffer to recipient */ - if (0 > (rc = orte_rml.send_buffer(recipient, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* wait to complete */ - OPAL_THREAD_LOCK(&coll->lock); - while (0 == coll->recvd) { - opal_condition_wait(&coll->cond, &coll->lock); - } - 
coll->recvd = 0; /* reset for next time */ - OPAL_THREAD_UNLOCK(&coll->lock); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:app received barrier release", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; -} - -int orte_grpcomm_base_app_allgather(orte_process_name_t *recipient, - orte_grpcomm_collective_t *coll, - opal_buffer_t *sbuf, - opal_buffer_t *rbuf) -{ - int rc; - opal_buffer_t buf; - orte_rml_tag_t tag=ORTE_RML_TAG_ALLGATHER; - int32_t nc; - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:app entering allgather", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - - /* if I am alone, just copy data across and return */ - if (1 == orte_process_info.num_procs) { - /* since we won't be going through the daemon collective, - * we have to pack num_contributors=1 so that - * things will unpack correctly - */ - nc = 1; - opal_dss.pack(rbuf, &nc, 1, OPAL_INT32); - opal_dss.copy_payload(rbuf, sbuf); - return ORTE_SUCCESS; - } - - /* everyone sends data to their local daemon */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* add the allgather tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* add our data to it */ - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, sbuf))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - /* send to recipient */ - if (0 > (rc = orte_rml.send_buffer(recipient, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:app allgather buffer sent", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* wait to complete */ - OPAL_THREAD_LOCK(&coll->lock); - while (coll->recvd < orte_process_info.num_procs) { - opal_condition_wait(&coll->cond, &coll->lock); - } - /* xfer the collected data */ - opal_dss.copy_payload(rbuf, &coll->results); - /* reset for 
next time */ - OBJ_DESTRUCT(&coll->results); - OBJ_CONSTRUCT(&coll->results, opal_buffer_t); - coll->recvd = 0; - OPAL_THREAD_UNLOCK(&coll->lock); - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:app allgather completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_SUCCESS; -} diff --git a/orte/mca/grpcomm/base/grpcomm_base_coll.c b/orte/mca/grpcomm/base/grpcomm_base_coll.c deleted file mode 100644 index 0275ac01e1..0000000000 --- a/orte/mca/grpcomm/base/grpcomm_base_coll.c +++ /dev/null @@ -1,923 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2008 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ - -#include "orte_config.h" -#include "orte/types.h" - - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/util/output.h" -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" -#include "orte/orted/orted.h" -#include "orte/runtime/orte_wait.h" - -#include "orte/mca/grpcomm/base/base.h" - -/*************** TUNED COLLECTIVES FOR GRPCOMM MODULES **************/ - -/**** AVAILABLE ALGORITHMS ****/ -static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t *vpids); -static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids); -static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids); - -/**** LOCAL VARIABLES USED IN COLLECTIVES ****/ -static int num_recvd; -static opal_buffer_t bucket; - -/* Receive and process collective messages */ -static void process_coll_msg(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - - /* transfer the data to the collecting bucket */ - opal_dss.copy_payload(&bucket, mev->buffer); - - /* cleanup */ - OBJ_RELEASE(mev); - - /* increment the number recvd */ - num_recvd++; -} - -void orte_grpcomm_base_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - 
"%s grpcomm:coll:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_coll_msg); - - return; -} - -/* - * Switchyard for selecting the collective algorithm to use - */ -int orte_grpcomm_base_allgather(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids) -{ - bool has_one; - orte_vpid_t n; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:allgather called with %d entries np %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - num_entries, (int)np)); - - /* if we only have one proc participating, just copy the data across and return */ - if (1 == np) { - opal_dss.pack(recvbuf, &num_entries, 1, OPAL_INT32); - return opal_dss.copy_payload(recvbuf, sendbuf); - } - - if (2 == np) { - /* only two procs in collective */ - return twoproc(sendbuf, recvbuf, num_entries, jobid, vpids); - } - - /* if we have power of 2 participants, use recursive doubling - otherwise, - * use bruck algorithm - */ - has_one = false; - n = np; - for ( ; n > 0; n >>= 1) { - if (n & 0x1) { - if (has_one) { - return bruck(sendbuf, recvbuf, num_entries, jobid, np, vpids); - } - has_one = true; - } - } - - /* must be power of two! 
*/ - return recursivedoubling(sendbuf, recvbuf, num_entries, jobid, np, vpids); -} - - -/* - * The Two-Proc Algorithm - * - * One sends to zero, zero waits to recv from one - * Zero adds its data to message, sends result back to one - */ -static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t *vpids) -{ - orte_process_name_t peer; - int32_t num_remote, cnt; - int rc; - opal_buffer_t buf; - - peer.jobid = jobid; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc algo employed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (vpids[0] == ORTE_PROC_MY_NAME->vpid) { - /* I send first */ - peer.vpid = vpids[1]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - /* setup a temp buffer so I can inform the other side as to the - * number of entries in my buffer - */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, sendbuf); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* wait for reply */ - num_recvd = 0; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } - - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc got my return message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - } else { - /* if I am not the start, then I recv first */ - num_recvd = 0; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - 
ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } - - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc got my starting message", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* send my data back */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, sendbuf); - peer.vpid = vpids[0]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:two-proc sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - } - - /* extract the number of entries in the remote buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* output of a collective begins with the total number of entries */ - num_remote += num_entries; - if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &num_remote, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* xfer my data */ - opal_dss.copy_payload(recvbuf, sendbuf); - /* xfer the recvd data */ - opal_dss.copy_payload(recvbuf, &bucket); - - /* cleanup */ - OBJ_DESTRUCT(&bucket); - - return ORTE_SUCCESS; -} - - -/* For a complete description of this algorithm, please look at - * ompi/mca/coll/tuned/coll_tuned_allgather.c - */ -static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids) -{ - orte_vpid_t rank, distance, nv; - orte_process_name_t peer; - int32_t num_remote, total_entries, cnt; - opal_buffer_t collection, buf; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s 
grpcomm:coll:bruck algo employed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* initialize */ - total_entries = num_entries; - - /* start by seeding the collection with our own data */ - OBJ_CONSTRUCT(&collection, opal_buffer_t); - opal_dss.copy_payload(&collection, sendbuf); - - /* collective is constrained to take place within the specified jobid */ - peer.jobid = jobid; - - /* Communication step: - At every step i, rank r: - - doubles the distance - - sends message containing all data collected so far to rank r - distance - - receives message containing all data collected so far from rank (r + distance) - */ - /* find my position in the group of participants. This - * value is the "rank" we will use in the algo - */ - rank = ORTE_VPID_INVALID; - for (nv=0; nv < np; nv++) { - if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) { - rank = nv; - break; - } - } - - /* check for bozo case */ - if (ORTE_VPID_INVALID == rank) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - for (distance = 1; distance < np; distance <<= 1) { - - /* first send my current contents */ - nv = (rank - distance + np) % np; - peer.vpid = vpids[nv]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, &collection); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:bruck sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* now setup to recv from my other partner */ - num_recvd = 0; - nv = (rank + distance) % np; - peer.vpid = vpids[nv]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - 
ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* and wait for it to get here */ - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - - /* extract the number of entries in the remote buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* add it to our running total */ - total_entries += num_remote; - - /* transfer the data to our collection */ - opal_dss.copy_payload(&collection, &bucket); - - /* cleanup */ - OBJ_DESTRUCT(&bucket); - } - - /* output of a collective begins with the total number of entries */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* transfer the collected data */ - opal_dss.copy_payload(recvbuf, &collection); - - /* cleanup */ - OBJ_DESTRUCT(&collection); - - return ORTE_SUCCESS; -} - -/* For a complete description of this algorithm, please look at - * ompi/mca/coll/tuned/coll_tuned_allgather.c - */ -static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries, - orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids) -{ - orte_vpid_t rank, distance, nv; - int32_t num_remote, total_entries, cnt; - opal_buffer_t collection, buf; - orte_process_name_t peer; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:recdub algo employed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* initialize */ - total_entries = num_entries; - - /* start by seeding the collection with our own data */ - OBJ_CONSTRUCT(&collection, opal_buffer_t); - opal_dss.copy_payload(&collection, sendbuf); - - /* collective is constrained to take place within the specified jobid */ - peer.jobid = jobid; - - /* Communication step: - At every step i, rank r: - - exchanges message containing all data collected so far with rank peer = (r ^ 2^i). 
- */ - /* find my position in the group of participants. This - * value is the "rank" we will use in the algo - */ - rank = ORTE_VPID_INVALID; - for (nv=0; nv < np; nv++) { - if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) { - rank = nv; - break; - } - } - - /* check for bozo case */ - if (ORTE_VPID_INVALID == rank) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - for (distance = 0x1; distance < np; distance<<=1) { - - /* first send my current contents */ - nv = rank ^ distance; - peer.vpid = vpids[nv]; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32); - opal_dss.copy_payload(&buf, &collection); - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:coll:recdub sending to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OBJ_DESTRUCT(&buf); - - /* now setup to recv from my other partner */ - num_recvd = 0; - OBJ_CONSTRUCT(&bucket, opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_coll_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* and wait for it to get here */ - ORTE_PROGRESSED_WAIT(false, num_recvd, 1); - - /* extract the number of entries in the remote buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* add it to our running total */ - total_entries += num_remote; - - /* transfer the data to our collection */ - opal_dss.copy_payload(&collection, &bucket); - - /* cleanup */ - OBJ_DESTRUCT(&bucket); - } - - /* output of a collective begins with the total number of entries */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) { - 
ORTE_ERROR_LOG(rc); - return rc; - } - - /* transfer the collected data */ - opal_dss.copy_payload(recvbuf, &collection); - - /* cleanup */ - OBJ_DESTRUCT(&collection); - - return ORTE_SUCCESS; -} - -/**** DAEMON COLLECTIVE SUPPORT ****/ - -static void reset_child_participation(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? */ - if (child->name->jobid == job) { - /* clear flag */ - child->coll_recvd = false; - } - } -} - -static bool all_children_participated(orte_jobid_t job) -{ - opal_list_item_t *item; - orte_odls_child_t *child; - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* is this child part of the specified job? 
*/ - if (child->name->jobid == job && !child->coll_recvd) { - /* if this child has *not* participated yet, return false */ - return false; - } - } - - /* if we get here, then everyone in the job has participated */ - return true; - -} - -void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender, - opal_buffer_t *data) -{ - orte_jobid_t jobid; - orte_odls_job_t *jobdat; - orte_routed_tree_t *child; - orte_std_cntr_t n; - opal_list_t daemon_tree; - opal_list_item_t *item, *next; - int32_t num_contributors; - opal_buffer_t buf; - orte_process_name_t my_parent, proc; - orte_vpid_t daemonvpid; - int rc; - int32_t numc; - orte_rml_tag_t rmltag; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective called", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* unpack the jobid using this collective */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return; - } - - /* lookup the job record for it */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jobdat->jobid == jobid) { - break; - } - } - if (NULL == jobdat) { - /* race condition - someone sent us a collective before we could - * parse the add_local_procs cmd. Just add the jobdat object - * and continue - */ - jobdat = OBJ_NEW(orte_odls_job_t); - jobdat->jobid = jobid; - opal_list_append(&orte_local_jobdata, &jobdat->super); - } - - /* it may be possible to get here prior to having actually finished processing our - * local launch msg due to the race condition between different nodes and when - * they start their individual procs. 
Hence, we have to first ensure that we - * -have- finished processing the launch msg, or else we won't know whether - * or not to wait before sending this on - */ - OPAL_THREAD_LOCK(&jobdat->lock); - while (!jobdat->launch_msg_processed) { - opal_condition_wait(&jobdat->cond, &jobdat->lock); - } - OPAL_THREAD_UNLOCK(&jobdat->lock); - - /* unpack the tag for this collective */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &rmltag, &n, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - return; - } - - /* unpack the number of contributors in this data bucket */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return; - } - jobdat->num_contributors += num_contributors; - - /* xfer the data */ - opal_dss.copy_payload(&jobdat->collection_bucket, data); - - /* count the number of participants collected */ - jobdat->num_collected++; - - /* if we haven't already done so, figure out how many participants we - * should be expecting - */ - if (jobdat->num_participating < 0) { - if (0 < jobdat->num_local_procs) { - /* we have children, so account for our own participation */ - jobdat->num_participating = 1; - } else { - jobdat->num_participating = 0; - } - /* now see if anyone else will be sending us something */ - OBJ_CONSTRUCT(&daemon_tree, opal_list_t); - orte_routed.get_routing_tree(&daemon_tree); - /* unfortunately, there is no simple way to determine which of our "child" - * daemons in the routing tree will be sending us something. 
All we can do - * is brute force a search, though we attempt to keep it as short as possible - */ - proc.jobid = jobid; - proc.vpid = 0; - while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) { - ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); - - /* get the daemon that hosts this proc */ - daemonvpid = orte_ess.proc_get_daemon(&proc); - /* is this daemon one of our children, or at least its contribution - * will pass through one of our children - */ - item = opal_list_get_first(&daemon_tree); - while (item != opal_list_get_end(&daemon_tree)) { - next = opal_list_get_next(item); - child = (orte_routed_tree_t*)item; - if (child->vpid == daemonvpid || opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) { - /* it does - add to num_participating */ - jobdat->num_participating++; - /* remove this from the list so we don't double count it */ - opal_list_remove_item(&daemon_tree, item); - /* done with search */ - break; - } - item = next; - } - proc.vpid++; - } - } - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective for job %s from %s type %ld" - " num_collected %d num_participating %d num_contributors %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid), - ORTE_NAME_PRINT(sender), - (long)jobdat->collective_type, jobdat->num_collected, - jobdat->num_participating, jobdat->num_contributors)); - - if (jobdat->num_collected == jobdat->num_participating) { - /* if I am the HNP, go process the results */ - if (ORTE_PROC_IS_HNP) { - goto hnp_process; - } - - /* if I am not the HNP, send to my parent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the target tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the number of contributors */ - if (ORTE_SUCCESS != 
(rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return; - } - /* xfer the payload*/ - opal_dss.copy_payload(&buf, &jobdat->collection_bucket); - /* reset everything for next collective */ - jobdat->num_contributors = 0; - jobdat->num_collected = 0; - OBJ_DESTRUCT(&jobdat->collection_bucket); - OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t); - /* send it */ - my_parent.jobid = ORTE_PROC_MY_NAME->jobid; - my_parent.vpid = orte_routed.get_routing_tree(NULL); - ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&my_parent))); - if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { - ORTE_ERROR_LOG(rc); - return; - } - OBJ_DESTRUCT(&buf); - } - return; - -hnp_process: - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: daemon collective HNP - xcasting to job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - /* setup a buffer to send the results back to the job members */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - - /* add any collected data */ - numc = jobdat->num_contributors; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - /* reset everything for next collective */ - jobdat->num_contributors = 0; - jobdat->num_collected = 0; - OBJ_DESTRUCT(&jobdat->collection_bucket); - OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t); - /* send the buffer */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, rmltag))) { - ORTE_ERROR_LOG(rc); - } - -cleanup: - OBJ_DESTRUCT(&buf); - - return; -} - -static void 
process_msg(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - orte_process_name_t *proc; - opal_buffer_t *buf, relay; - int32_t rc, n; - opal_list_item_t *item; - orte_odls_child_t *child; - bool found = false; - orte_odls_job_t *jobdat; - orte_rml_tag_t rmltag; - - proc = &mev->sender; - buf = mev->buffer; - - /* is the sender a local proc, or a daemon relaying the collective? */ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* this is a relay - call that code */ - orte_grpcomm_base.daemon_coll(proc, buf); - goto CLEANUP; - } - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - /* find this child */ - if (OPAL_EQUAL == opal_dss.compare(proc, child->name, ORTE_NAME)) { - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: collecting data from child %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - - found = true; - break; - } - } - - /* if it wasn't found on the list, then we need to add it - must have - * come from a singleton - */ - if (!found) { - child = OBJ_NEW(orte_odls_child_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, proc, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - return; - } - opal_list_append(&orte_local_children, &child->super); - /* we don't know any other info about the child, so just indicate it's - * alive - */ - child->alive = true; - /* setup a jobdat for it */ - orte_odls_base_setup_singleton_jobdat(proc->jobid); - } - - /* this was one of our local procs - find the jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == proc->jobid) { - break; - } - } - if (NULL == jobdat) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto CLEANUP; - } - - /* unpack the target tag */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &rmltag, &n, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - /* collect the provided data */ - opal_dss.copy_payload(&jobdat->local_collection, buf); - - /* flag this proc as having participated */ - child->coll_recvd = true; - - /* now check to see if all local procs in this job have participated */ - if (all_children_participated(proc->jobid)) { - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: executing collective", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* prep a buffer to pass it all along */ - OBJ_CONSTRUCT(&relay, opal_buffer_t); - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&relay, &proc->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the target tag */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&relay, &rmltag, 1, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(rc); - return; - } - /* pack the number of contributors */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&relay, &jobdat->num_local_procs, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return; - } - /* xfer the payload*/ - opal_dss.copy_payload(&relay, &jobdat->local_collection); - /* refresh the collection bucket for reuse */ - OBJ_DESTRUCT(&jobdat->local_collection); - OBJ_CONSTRUCT(&jobdat->local_collection, opal_buffer_t); - reset_child_participation(proc->jobid); - /* pass this to the daemon collective operation */ - orte_grpcomm_base.daemon_coll(ORTE_PROC_MY_NAME, &relay); - /* done with the relay */ - OBJ_DESTRUCT(&relay); - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll: collective completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } - -CLEANUP: - /* release the message */ - OBJ_RELEASE(mev); -} - -void 
orte_grpcomm_base_daemon_coll_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:daemon_coll:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - - /* reissue the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DAEMON_COLLECTIVE, - ORTE_RML_NON_PERSISTENT, - orte_grpcomm_base_daemon_coll_recv, - cbdata))) { - ORTE_ERROR_LOG(rc); - } - return; -} diff --git a/orte/mca/grpcomm/base/grpcomm_base_modex.c b/orte/mca/grpcomm/base/grpcomm_base_modex.c index 5b663fe3d5..987ddeb8ff 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_modex.c +++ b/orte/mca/grpcomm/base/grpcomm_base_modex.c @@ -12,6 +12,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -46,76 +48,98 @@ #include "orte/mca/grpcomm/base/base.h" #include "orte/mca/grpcomm/grpcomm.h" -/*************** MODEX SECTION **************/ -int orte_grpcomm_base_full_modex(opal_list_t *procs) +orte_grpcomm_coll_id_t orte_grpcomm_base_get_coll_id(void) { - opal_buffer_t buf, rbuf; - int32_t i, n, num_procs; - orte_std_cntr_t cnt; - orte_process_name_t proc_name; - int rc=ORTE_SUCCESS; - orte_nid_t *nid; + orte_grpcomm_coll_id_t id; + + /* assign the next collective id */ + id = orte_grpcomm_base.coll_id; + /* rotate to the next value */ + orte_grpcomm_base.coll_id++; + return id; +} + + +/*************** MODEX SECTION **************/ +int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex) +{ + int rc; orte_local_rank_t local_rank; orte_node_rank_t node_rank; - orte_jmap_t *jmap; - orte_pmap_t *pmap; - orte_vpid_t daemon; - char *hostname; + orte_namelist_t *nm; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex: performing modex", + "%s grpcomm:base:modex: performing modex", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* setup the buffer that will actually be sent */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - OBJ_CONSTRUCT(&rbuf, opal_buffer_t); - + /* record the collective */ + modex->active = true; + modex->next_cbdata = modex; + opal_list_append(&orte_grpcomm_base.active_colls, &modex->super); + /* put our process name in the buffer so it can be unpacked later */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* pack our hostname */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + if (0 == opal_list_get_size(&modex->participants)) { + /* add a wildcard name to the participants so the daemon knows + * that everyone in 
my job must participate + */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&modex->participants, &nm->super); + modex->next_cb = orte_grpcomm_base_store_modex; + } else { + /* this is not amongst our peers, but rather between a select + * group of processes - e.g., during a connect/accept operation. + * Thus, this requires we send additional info + */ + modex->next_cb = orte_grpcomm_base_store_peer_modex; + + /* pack our hostname */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.nodename, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* pack our daemon's vpid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our daemon's vpid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* pack our node rank */ - node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node_rank, 1, ORTE_NODE_RANK))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our node rank */ + node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &node_rank, 1, ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* pack our local rank */ - local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &local_rank, 1, ORTE_LOCAL_RANK))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our local rank */ + local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &local_rank, 1, ORTE_LOCAL_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } #if OPAL_HAVE_HWLOC - /* pack our binding info so other procs can determine our locality */ - if (ORTE_SUCCESS != 
(rc = opal_dss.pack(&buf, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.bind_idx, 1, OPAL_UINT))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* pack our binding info so other procs can determine our locality */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.bind_level, 1, OPAL_HWLOC_LEVEL_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.bind_idx, 1, OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } #endif + } /* pack the entries we have received */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&modex->buffer))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -124,76 +148,67 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs) "%s grpcomm:base:full:modex: executing allgather", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (NULL == procs) { - /* exchange the buffer with my peers */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } else { - /* exchange the buffer with the list of peers */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - } - - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex: processing modex info", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - - - /* extract the number of procs that put data in the buffer */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_procs, &cnt, OPAL_INT32))) { + /* execute the allgather */ + if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(modex))) { ORTE_ERROR_LOG(rc); goto cleanup; } + + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s grpcomm:base:modex: modex posted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - 
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex: received %ld data bytes from %d procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)(rbuf.pack_ptr - rbuf.unpack_ptr), num_procs)); - - /* if the buffer doesn't have any more data, ignore it */ - if (0 >= (rbuf.pack_ptr - rbuf.unpack_ptr)) { - goto cleanup; - } - - /* otherwise, process it */ - for (i=0; i < num_procs; i++) { - /* unpack the process name */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &proc_name, &cnt, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - + return ORTE_SUCCESS; + + cleanup: + OBJ_RELEASE(modex); + return rc; +} + +void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata) +{ + int rc, n, cnt; + orte_process_name_t proc_name; + char *hostname; + orte_vpid_t daemon; + orte_node_rank_t node_rank; + orte_local_rank_t local_rank; + orte_nid_t *nid; + orte_jmap_t *jmap; + orte_pmap_t *pmap; + orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; + + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s STORING PEER MODEX DATA", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* unpack the process name */ + cnt=1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { /* unpack the hostname */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &hostname, &cnt, OPAL_STRING))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &hostname, &cnt, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack the daemon vpid */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &daemon, &cnt, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &daemon, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack the node rank */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; 
} /* unpack the local rank */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -272,12 +287,12 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs) /* unpack the locality info */ cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &bind_idx, &cnt, OPAL_UINT))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &bind_idx, &cnt, OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -348,72 +363,43 @@ int orte_grpcomm_base_full_modex(opal_list_t *procs) ORTE_NAME_PRINT(&proc_name))); /* update the modex database */ - if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, rbuf))) { ORTE_ERROR_LOG(rc); goto cleanup; } - } - + } + cleanup: - OBJ_DESTRUCT(&buf); - OBJ_DESTRUCT(&rbuf); - return rc; + /* flag the collective as complete */ + modex->active = false; + /* cleanup */ + opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super); + /* notify that the modex is complete */ + if (NULL != modex->cbfunc) { + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s CALLING MODEX RELEASE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + modex->cbfunc(NULL, modex->cbdata); + } } -int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf) +void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata) { - int32_t i, num_procs; orte_std_cntr_t cnt; orte_process_name_t proc_name; int rc=ORTE_SUCCESS; - orte_vpid_t daemon; - orte_pmap_t *pmap; + orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; - /* process the results 
*/ - /* extract the number of procs that put data in the buffer */ - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_procs, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, - "%s grpcomm:base:modex:unpack: received %ld data bytes from %d procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)(rbuf->pack_ptr - rbuf->unpack_ptr), num_procs)); - - /* if the buffer doesn't have any more data, ignore it */ - if (0 >= (rbuf->pack_ptr - rbuf->unpack_ptr)) { - goto cleanup; - } - - /* otherwise, process it */ - for (i = 0; i < num_procs; i++) { - /* unpack the process name */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s STORING MODEX DATA", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* unpack the process name */ + cnt=1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { - /* SINCE THIS IS AMONGST PEERS, THERE IS NO NEED TO UPDATE THE NIDMAP/PIDMAP */ - - if (ORTE_VPID_INVALID == (daemon = orte_ess.proc_get_daemon(&proc_name))) { - /* clear problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - - if (NULL == (pmap = orte_util_lookup_pmap(&proc_name))) { - /* clear problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:modex:unpack: adding modex entry for proc %s", + "%s grpcomm:base:store_modex adding modex entry for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name))); @@ -423,9 +409,22 @@ int orte_grpcomm_base_modex_unpack( opal_buffer_t* rbuf) goto cleanup; } } + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + } cleanup: - return rc; + /* flag the modex as complete */ + modex->active = false; + /* cleanup 
*/ + opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super); + /* execute user callback, if requested */ + if (NULL != modex->cbfunc) { + OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, + "%s CALLING MODEX RELEASE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + modex->cbfunc(NULL, modex->cbdata); + } } /** diff --git a/orte/mca/grpcomm/base/grpcomm_base_open.c b/orte/mca/grpcomm/base/grpcomm_base_open.c index 688b223c5d..e1f4de594c 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_open.c +++ b/orte/mca/grpcomm/base/grpcomm_base_open.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,12 +55,9 @@ int orte_grpcomm_base_open(void) verbose set by the mca open system... */ orte_grpcomm_base.output = opal_output_open(NULL); - /* define the default daemon collective fn */ -#if ORTE_DISABLE_FULL_SUPPORT - orte_grpcomm_base.daemon_coll = NULL; -#else - orte_grpcomm_base.daemon_coll = orte_grpcomm_base_daemon_collective; -#endif + /* init globals */ + OBJ_CONSTRUCT(&orte_grpcomm_base.active_colls, opal_list_t); + orte_grpcomm_base.coll_id = 0; #if OPAL_HAVE_HWLOC orte_grpcomm_base.working_cpuset = NULL; @@ -78,21 +77,64 @@ int orte_grpcomm_base_open(void) return ORTE_SUCCESS; } +orte_grpcomm_collective_t* orte_grpcomm_base_setup_collective(orte_grpcomm_coll_id_t id) +{ + opal_list_item_t *item; + orte_grpcomm_collective_t *cptr, *coll; + + coll = NULL; + for (item = opal_list_get_first(&orte_grpcomm_base.active_colls); + item != opal_list_get_end(&orte_grpcomm_base.active_colls); + item = opal_list_get_next(item)) { + cptr = (orte_grpcomm_collective_t*)item; + if (id == cptr->id) { + coll = cptr; + break; + } + } + if (NULL == coll) { + coll = OBJ_NEW(orte_grpcomm_collective_t); + coll->id = id; + 
opal_list_append(&orte_grpcomm_base.active_colls, &coll->super); + } + + return coll; +} + /* local objects */ static void collective_constructor(orte_grpcomm_collective_t *ptr) { - OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t); - OBJ_CONSTRUCT(&ptr->cond, opal_condition_t); - OBJ_CONSTRUCT(&ptr->results, opal_buffer_t); - ptr->recvd = 0; + ptr->id = -1; + ptr->active = false; + ptr->num_local_recvd = 0; + OBJ_CONSTRUCT(&ptr->local_bucket, opal_buffer_t); + ptr->num_peer_buckets = 0; + ptr->num_global_recvd = 0; + ptr->locally_complete = false; + OBJ_CONSTRUCT(&ptr->participants, opal_list_t); + ptr->cbfunc = NULL; + ptr->cbdata = NULL; + OBJ_CONSTRUCT(&ptr->buffer, opal_buffer_t); + OBJ_CONSTRUCT(&ptr->targets, opal_list_t); + ptr->next_cb = NULL; + ptr->next_cbdata = NULL; } static void collective_destructor(orte_grpcomm_collective_t *ptr) { - OBJ_DESTRUCT(&ptr->lock); - OBJ_DESTRUCT(&ptr->cond); - OBJ_DESTRUCT(&ptr->results); + opal_list_item_t *item; + + OBJ_DESTRUCT(&ptr->local_bucket); + while (NULL != (item = opal_list_remove_first(&ptr->participants))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&ptr->participants); + OBJ_DESTRUCT(&ptr->buffer); + while (NULL != (item = opal_list_remove_first(&ptr->targets))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&ptr->targets); } OBJ_CLASS_INSTANCE(orte_grpcomm_collective_t, - opal_object_t, + opal_list_item_t, collective_constructor, collective_destructor); diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c new file mode 100644 index 0000000000..ac80c2d2a4 --- /dev/null +++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c @@ -0,0 +1,686 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +/* + * includes + */ +#include "orte_config.h" + + +#include "opal/dss/dss.h" + +#include "orte/util/proc_info.h" +#include "orte/util/error_strings.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/state/state.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/grpcomm/grpcomm_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/grpcomm/base/base.h" + +static bool recv_issued=false; +static void daemon_local_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void daemon_coll_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void app_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +static void coll_id_req(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); + +int orte_grpcomm_base_comm_start(void) +{ + int rc; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive start comm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (!recv_issued) { + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_COLLECTIVE, + ORTE_RML_PERSISTENT, + daemon_local_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; 
+ } + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_XCAST, + ORTE_RML_PERSISTENT, + orte_grpcomm_base_xcast_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_DAEMON_COLL, + ORTE_RML_PERSISTENT, + daemon_coll_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + if (ORTE_PROC_IS_HNP) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_COLL_ID_REQ, + ORTE_RML_PERSISTENT, + coll_id_req, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + } + recv_issued = true; + } else if (ORTE_PROC_IS_APP) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_COLLECTIVE, + ORTE_RML_PERSISTENT, + app_recv, NULL))) { + ORTE_ERROR_LOG(rc); + recv_issued = false; + return rc; + } + recv_issued = true; + } + } + + return ORTE_SUCCESS; +} + + +void orte_grpcomm_base_comm_stop(void) +{ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive stop comm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (recv_issued) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE); + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_XCAST); + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON_COLL); + } + if (ORTE_PROC_IS_HNP) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID_REQ); + } + recv_issued = false; + } +} + +static void coll_id_req(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + orte_grpcomm_coll_id_t id; + opal_buffer_t *relay; + int rc; + /* collective - only the HNP ever gets this message, but check + * in case a developer makes a mistake! 
+ */ + id = orte_grpcomm_base_get_coll_id(); + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive proc %s requested coll id - returned id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender), id)); + relay = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(relay, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(relay); + return; + } + if (0 > (rc = orte_rml.send_buffer_nb(sender, relay, ORTE_RML_TAG_COLL_ID, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(relay); + return; + } +} + + +/* process incoming coll returns */ +static void app_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + orte_grpcomm_collective_t *coll; + opal_list_item_t *item; + int n, rc; + orte_grpcomm_coll_id_t id; + + /* get the collective id */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + return; + } + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive processing collective return for id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + + /* search my list of active collectives */ + for (item = opal_list_get_first(&orte_grpcomm_base.active_colls); + item != opal_list_get_end(&orte_grpcomm_base.active_colls); + item = opal_list_get_next(item)) { + coll = (orte_grpcomm_collective_t*)item; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s CHECKING COLL id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id)); + + if (id == coll->id) { + /* see if the collective needs another step */ + if (NULL != coll->next_cb) { + /* have to go here next */ + coll->next_cb(buffer, coll->next_cbdata); + break; + } + /* flag the collective as complete */ + coll->active = false; + /* cleanup */ + opal_list_remove_item(&orte_grpcomm_base.active_colls, item); + /* callback the specified function */ + if (NULL != coll->cbfunc) { + 
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:receive executing callback", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + coll->cbfunc(buffer, coll->cbdata); + } + break; + } + } +} + +/**** DAEMON COLLECTIVE SUPPORT ****/ +/* recv for collective messages sent from a daemon's local procs */ +static void daemon_local_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + int32_t rc, n; + orte_vpid_t nprocs; + orte_job_t *jdata; + orte_grpcomm_collective_t *coll; + orte_process_name_t proc; + orte_namelist_t *nm; + bool keep; + orte_vpid_t i; + orte_grpcomm_coll_id_t id; + bool do_progress=true; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLLECTIVE RECVD FROM %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* unpack the collective id */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + return; + } + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s WORKING COLLECTIVE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + + /* setup the collective for this id - if it's already present, + * then this will just return the existing structure + */ + coll = orte_grpcomm_base_setup_collective(id); + + /* record this proc's participation and its data */ + coll->num_local_recvd++; + + /* unpack the number of participants */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nprocs, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return; + } + + /* do we already have the names of all participants in this collective */ + keep = true; + if (0 < opal_list_get_size(&coll->participants)) { + /* we already have it, so don't bother saving the data */ + keep = false; + } + + /* even if we don't need the names, we still have to + * unpack them to get to the data + */ + for (i=0; i < nprocs; i++) { + /* unpack the name of this participant */ + n = 1; + if (ORTE_SUCCESS != (rc = 
opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + return; + } + if (keep) { + /* add the name to the list */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc.jobid; + nm->name.vpid = proc.vpid; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s ADDING %s TO PARTICIPANTS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc))); + opal_list_append(&coll->participants, &nm->super); + } + /* find this job */ + if (NULL == (jdata = orte_get_job_data_object(proc.jobid))) { + /* if we can't find it, then we haven't processed the + * launch msg for this job yet - can't happen with + * our own local procs, but this could involve a proc + * running remotely that we don't know about yet + */ + do_progress = false; + } + } + + /* what remains in the buffer is solely the data payload, so + * add it to the collective + */ + opal_dss.copy_payload(&coll->local_bucket, buffer); + + /* if all involved jobs are known, then progress collectives */ + if (do_progress) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s PROGRESSING COLLECTIVE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + orte_grpcomm_base_progress_collectives(); + } +} + +void orte_grpcomm_base_pack_collective(opal_buffer_t *relay, + orte_grpcomm_collective_t *coll, + orte_grpcomm_internal_stage_t stg) +{ + orte_vpid_t nprocs; + orte_namelist_t *nm; + opal_list_item_t *itm; + + opal_dss.pack(relay, &coll->id, 1, ORTE_GRPCOMM_COLL_ID_T); + nprocs = opal_list_get_size(&coll->participants); + opal_dss.pack(relay, &nprocs, 1, ORTE_VPID); + if (0 < nprocs) { + for (itm = opal_list_get_first(&coll->participants); + itm != opal_list_get_end(&coll->participants); + itm = opal_list_get_next(itm)) { + nm = (orte_namelist_t*)itm; + opal_dss.pack(relay, &nm->name, 1, ORTE_NAME); + } + } + if (ORTE_GRPCOMM_INTERNAL_STG_LOCAL == stg) { + opal_dss.pack(relay, &coll->num_local_recvd, 1, ORTE_VPID); + opal_dss.copy_payload(relay, &coll->local_bucket); + } else if 
(ORTE_GRPCOMM_INTERNAL_STG_APP == stg) { + opal_dss.copy_payload(relay, &coll->buffer); + } else if (ORTE_GRPCOMM_INTERNAL_STG_GLOBAL == stg) { + opal_dss.pack(relay, &coll->num_global_recvd, 1, ORTE_VPID); + opal_dss.copy_payload(relay, &coll->buffer); + } else { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + } +} + + +void orte_grpcomm_base_progress_collectives(void) +{ + opal_list_item_t *item, *itm; + orte_grpcomm_collective_t *coll; + orte_namelist_t *nm; + orte_job_t *jdata; + orte_vpid_t nlp, vpid; + opal_buffer_t *relay; + int rc; + + /* cycle thru all known collectives - any collective on the list + * must have come from either a local proc or receiving a global + * collective. Either way, the number of required recipients + * should have been set + */ + item = opal_list_get_first(&orte_grpcomm_base.active_colls); + while (item != opal_list_get_end(&orte_grpcomm_base.active_colls)) { + coll = (orte_grpcomm_collective_t*)item; + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s PROGRESSING COLL id %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id)); + /* if this collective is already locally complete, then ignore it */ + if (coll->locally_complete) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d IS LOCALLY COMPLETE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id)); + goto next_coll; + } + /* setup to count number of local participants */ + nlp = 0; + /* check all participants */ + for (itm = opal_list_get_first(&coll->participants); + itm != opal_list_get_end(&coll->participants); + itm = opal_list_get_next(itm)) { + nm = (orte_namelist_t*)itm; + /* get the job object for this participant */ + if (NULL == (jdata = orte_get_job_data_object(nm->name.jobid))) { + /* if the job object isn't found, then we can't progress + * this collective + */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d JOBID %s NOT FOUND", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id, ORTE_JOBID_PRINT(nm->name.jobid))); + goto next_coll; + } 
+ /* if the job object is found, then we know about this + * job - count its local participants + */ + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* all local procs from this job are required to participate */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s ALL LOCAL PROCS CONTRIBUTE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)jdata->num_local_procs)); + nlp += jdata->num_local_procs; + } else { + /* see if this is a local proc */ + if (ORTE_VPID_INVALID == (vpid = orte_ess.proc_get_daemon(&nm->name))) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d VPID %s NONLOCAL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id, ORTE_VPID_PRINT(nm->name.vpid))); + continue; + } + if (vpid == ORTE_PROC_MY_NAME->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:prog:collectives Counting %s as local participant", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); + nlp++; + } + } + } + /* see if all reqd participants are done */ + if (nlp == coll->num_local_recvd) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLLECTIVE %d LOCALLY COMPLETE - SENDING TO GLOBAL COLLECTIVE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id)); + /* mark it as locally complete */ + coll->locally_complete = true; + /* pack the collective */ + relay = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(relay, coll, ORTE_GRPCOMM_INTERNAL_STG_LOCAL); + /* send it to our global collective handler */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, + ORTE_RML_TAG_DAEMON_COLL, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(relay); + } + } + + next_coll: + item = opal_list_get_next(item); + } +} + +static void daemon_coll_recv(int status, orte_process_name_t* sender, + opal_buffer_t* data, orte_rml_tag_t tag, + void* cbdata) +{ + orte_job_t *jdata; + orte_std_cntr_t n; + opal_list_item_t *item; + orte_vpid_t np, nprocs, total_local_np; + int rc; + 
orte_grpcomm_collective_t *coll; + orte_namelist_t *nm; + orte_grpcomm_coll_id_t id; + bool keep, do_progress; + orte_process_name_t proc; + opal_buffer_t *relay; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: daemon collective recvd from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* get the collective id */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) { + ORTE_ERROR_LOG(rc); + return; + } + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: WORKING COLLECTIVE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id)); + + /* setup the collective for this id - if it's already present, + * then this will just return the existing structure + */ + coll = orte_grpcomm_base_setup_collective(id); + + /* record that we received a bucket */ + coll->num_peer_buckets++; + + /* unpack the number of procs involved */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &nprocs, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return; + } + + /* do we need to keep the participants? 
*/ + keep = true; + if (0 < opal_list_get_size(&coll->participants)) { + /* already have it */ + keep = false; + } + + do_progress = true; + total_local_np = 0; + for (np=0; np < nprocs; np++) { + /* unpack the name of this participant */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &proc, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + return; + } + if (keep) { + /* add the name to the list */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc.jobid; + nm->name.vpid = proc.vpid; + opal_list_append(&coll->participants, &nm->super); + } + /* find this job */ + if (NULL == (jdata = orte_get_job_data_object(proc.jobid))) { + /* if we can't find it, then we haven't processed the + * launch msg for this job yet - can't happen with + * our own local procs, but this could involve a proc + * running remotely that we don't know about yet + */ + do_progress = false; + } + if (NULL != jdata) { total_local_np += jdata->num_local_procs; } /* guard: jdata is NULL when the launch msg hasn't arrived */ + } + if (do_progress && 0 == total_local_np) { + coll->locally_complete = true; + } + + /* unpack the number of contributors involved in the incoming data */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &np, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return; + } + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: NUM CONTRIBS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_VPID_PRINT(np))); + /* add it to the number of global recvd */ + coll->num_global_recvd += np; + + /* transfer the data */ + opal_dss.copy_payload(&coll->buffer, data); + + /* are we done? 
*/ + if (!do_progress || !coll->locally_complete) { + /* can't continue - missing at least one launch msg + * or not locally complete + */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: CANNOT PROGRESS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } + + /* determine how many buckets we should receive from others + * involved in this collective - need to know the number + * of total contributors from all buckets being relayed + * thru us + */ + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_PEERS, coll); + np = 1; /* account for our own bucket */ + while (NULL != (item = opal_list_remove_first(&coll->targets))) { + nm = (orte_namelist_t*)item; + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* wait for input from all daemons */ + np = orte_process_info.num_procs; + break; + } else { + np++; + } + } + /* clear the list for reuse */ + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) { + OBJ_RELEASE(nm); + } + + /* relay the data, if required */ + if (np == coll->num_peer_buckets) { + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_RELAY, coll); + + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: RELAYING COLLECTIVE TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); + relay = OBJ_NEW(opal_buffer_t); + orte_grpcomm_base_pack_collective(relay, coll, ORTE_GRPCOMM_INTERNAL_STG_GLOBAL); + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* this is going to everyone in this job, so use xcast */ + orte_grpcomm.xcast(nm->name.jobid, relay, ORTE_RML_TAG_DAEMON_COLL); + OBJ_RELEASE(relay); OBJ_RELEASE(nm); continue; /* relay released - must not fall through to the sends below */ + } + /* otherwise, send to each member, but don't send it back to the + * sender as that can create an infinite loop + */ + if (nm->name.vpid == sender->vpid) { + OBJ_RELEASE(relay); + } else { + if (0 > orte_rml.send_buffer_nb(&nm->name, relay, ORTE_RML_TAG_DAEMON_COLL, 0, + 
orte_rml_send_callback, NULL)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(relay); + } + } + OBJ_RELEASE(nm); + } + } + /* clear the list for reuse */ + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) { + OBJ_RELEASE(nm); + } + + /* determine how many contributors we need to recv - we know + * that all job objects were found, so we can skip that test + * while counting + */ + np = 0; + for (item = opal_list_get_first(&coll->participants); + item != opal_list_get_end(&coll->participants); + item = opal_list_get_next(item)) { + nm = (orte_namelist_t*)item; + /* get the job object for this participant */ + jdata = orte_get_job_data_object(nm->name.jobid); + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* all procs from this job are required to participate */ + np += jdata->num_procs; + } else { + np++; + } + } + + /* are we done? */ + if (np != coll->num_global_recvd) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:daemon_coll: MISSING CONTRIBUTORS: np %s ngr %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_VPID_PRINT(np), + ORTE_VPID_PRINT(coll->num_global_recvd))); + return; + } + + /* find out where, if anywhere, to send the results */ + orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_COMPLETE, coll); + + /* pass the result */ + while (NULL != (item = opal_list_remove_first(&coll->targets))) { + nm = (orte_namelist_t*)item; + relay = OBJ_NEW(opal_buffer_t); + opal_dss.pack(relay, &coll->id, 1, ORTE_GRPCOMM_COLL_ID_T); + opal_dss.copy_payload(relay, &coll->buffer); + if (ORTE_VPID_WILDCARD == nm->name.vpid) { + /* all procs from this job get it */ + orte_grpcomm.xcast(nm->name.jobid, relay, ORTE_RML_TAG_COLLECTIVE); + OBJ_RELEASE(relay); + } else { + /* send it to this proc */ + if (0 > orte_rml.send_buffer_nb(&nm->name, relay, ORTE_RML_TAG_COLLECTIVE, 0, + orte_rml_send_callback, NULL)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(relay); + } + } + OBJ_RELEASE(nm); + } + + /* remove 
this collective */ + opal_list_remove_item(&orte_grpcomm_base.active_colls, &coll->super); + OBJ_RELEASE(coll); +} diff --git a/orte/mca/grpcomm/base/grpcomm_base_xcast.c b/orte/mca/grpcomm/base/grpcomm_base_xcast.c new file mode 100644 index 0000000000..5c1f1624e7 --- /dev/null +++ b/orte/mca/grpcomm/base/grpcomm_base_xcast.c @@ -0,0 +1,221 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +/* + * includes + */ +#include "orte_config.h" + + +#include "opal/dss/dss.h" + +#include "orte/util/proc_info.h" +#include "orte/util/error_strings.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/routed/routed.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/grpcomm/grpcomm_types.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/grpcomm/base/base.h" + +void orte_grpcomm_base_xcast_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + opal_list_item_t *item; + orte_namelist_t *nm; + int ret, cnt; + opal_buffer_t *relay; + orte_daemon_cmd_flag_t command; + opal_buffer_t wireup; + opal_byte_object_t *bo; + int8_t flag; + orte_grpcomm_collective_t coll; + + OPAL_OUTPUT_VERBOSE((1, 
orte_grpcomm_base.output, + "%s grpcomm:xcast:recv:send_relay", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* setup the relay message */ + relay = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(relay, buffer); + + /* peek at the command */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &cnt, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + + /* if it is add_procs, then... */ + if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) { + /* extract the byte object holding the daemonmap */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + + /* update our local nidmap, if required - the decode function + * knows what to do - it will also free the bytes in the bo + */ + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s grpcomm:base:xcast updating nidmap", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + /* update the routing plan */ + orte_routed.update_routing_plan(); + + /* see if we have wiring info as well */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + if (0 == flag) { + /* no - just return */ + goto relay; + } + + /* unpack the byte object */ + cnt=1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + if (0 < bo->size) { + /* load it into a buffer */ + OBJ_CONSTRUCT(&wireup, opal_buffer_t); + opal_dss.load(&wireup, bo->bytes, bo->size); + /* pass it for processing */ + if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&wireup); + goto relay; + } + /* done with the wireup buffer - dump it */ + OBJ_DESTRUCT(&wireup); + } + } + + relay: + /* setup the relay list */ + OBJ_CONSTRUCT(&coll, 
orte_grpcomm_collective_t); + + /* get the list of next recipients from the routed module */ + orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll); + + /* if list is empty, no relay is required */ + if (opal_list_is_empty(&coll.targets)) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s orte:daemon:send_relay - recipient list is empty!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CLEANUP; + } + + /* send the message to each recipient on list, deconstructing it as we go */ + while (NULL != (item = opal_list_remove_first(&coll.targets))) { + nm = (orte_namelist_t*)item; + + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s orte:daemon:send_relay sending relay msg to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&nm->name))); + OBJ_RETAIN(relay); + if (0 > (ret = orte_rml.send_buffer_nb(&nm->name, relay, ORTE_RML_TAG_XCAST, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(relay); + continue; + } + } + + CLEANUP: + /* cleanup */ + OBJ_DESTRUCT(&coll); + + /* now send it to myself for processing */ + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, + ORTE_RML_TAG_DAEMON, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(relay); + } +} + +int orte_grpcomm_base_pack_xcast(orte_jobid_t job, + opal_buffer_t *buffer, + opal_buffer_t *message, + orte_rml_tag_t tag) +{ + orte_daemon_cmd_flag_t command; + int rc; + + /* if this isn't intended for the daemon command tag, then we better + * tell the daemon to deliver it to the procs, and what job is supposed + * to get it - this occurs when a caller just wants to send something + * to all the procs in a job. In that use-case, the caller doesn't know + * anything about inserting daemon commands or what routing algo might + * be used, so we have to help them out a little. Functions that are + * sending commands to the daemons themselves are smart enough to know + * what they need to do. 
+ */ + if (ORTE_RML_TAG_DAEMON != tag) { + command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tag, 1, ORTE_RML_TAG))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + } + + /* copy the payload into the new buffer - this is non-destructive, so our + * caller is still responsible for releasing any memory in the buffer they + * gave to us + */ + if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(buffer, message))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + +CLEANUP: + return rc; +} + diff --git a/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c b/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c index 0a73964799..710245599b 100644 --- a/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c +++ b/orte/mca/grpcomm/cnos/grpcomm_cnos_module.c @@ -32,6 +32,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml_types.h" +#include "orte/mca/grpcomm/grpcomm_types.h" #include "grpcomm_cnos.h" #if OMPI_GRPCOMM_CNOS_HAVE_BARRIER @@ -50,11 +51,9 @@ static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int orte_grpcomm_cnos_barrier(void); +static int orte_grpcomm_cnos_barrier(orte_grpcomm_collective_t *coll); -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); - -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf); +static int allgather(orte_grpcomm_collective_t *coll); static int set_proc_attr(const char *attr_name, const void *data, @@ -64,7 +63,7 @@ static int get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, size_t *size); -static int modex(opal_list_t *procs); +static int modex(orte_grpcomm_collective_t *coll); static int purge_proc_attrs(void); @@ -73,7 +72,6 @@ 
orte_grpcomm_base_module_t orte_grpcomm_cnos_module = { finalize, xcast, allgather, - allgather_list, orte_grpcomm_cnos_barrier, set_proc_attr, get_proc_attr, @@ -113,37 +111,35 @@ static int xcast(orte_jobid_t job, } static int -orte_grpcomm_cnos_barrier(void) +orte_grpcomm_cnos_barrier(orte_grpcomm_collective_t *coll) { #if OMPI_GRPCOMM_CNOS_HAVE_BARRIER cnos_barrier(); #endif - + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) +static int allgather(orte_grpcomm_collective_t *coll) { int rc; orte_std_cntr_t zero=0; - - /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return rc; -} + opal_buffer_t rbuf; -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - int rc; - orte_std_cntr_t zero=0; - - /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; + rc = ORTE_SUCCESS; coll->active = false; /* init rc: final return is reached with no assignment when cbfunc is NULL */ + if (NULL != coll->cbfunc) { + /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ + OBJ_CONSTRUCT(&rbuf, opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&rbuf, &zero, 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&rbuf); + return rc; + } + coll->cbfunc(&rbuf, coll->cbdata); + OBJ_DESTRUCT(&rbuf); } return rc; } @@ -164,8 +160,12 @@ static int get_proc_attr(const orte_process_name_t proc, return ORTE_ERR_NOT_IMPLEMENTED; } -static int modex(opal_list_t *procs) +static int modex(orte_grpcomm_collective_t *coll) { + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } diff --git a/orte/mca/grpcomm/grpcomm.h b/orte/mca/grpcomm/grpcomm.h index 
9d6b4ac9f1..ab261057d1 100644 --- a/orte/mca/grpcomm/grpcomm.h +++ b/orte/mca/grpcomm/grpcomm.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,13 +65,10 @@ typedef int (*orte_grpcomm_base_module_xcast_fn_t)(orte_jobid_t job, orte_rml_tag_t tag); /* allgather - gather data from all procs */ -typedef int (*orte_grpcomm_base_module_allgather_fn_t)(opal_buffer_t *sbuf, opal_buffer_t *rbuf); - -typedef int (*orte_grpcomm_base_module_allgather_list_fn_t)(opal_list_t *names, - opal_buffer_t *sbuf, opal_buffer_t *rbuf); +typedef int (*orte_grpcomm_base_module_allgather_fn_t)(orte_grpcomm_collective_t *coll); /* barrier function */ -typedef int (*orte_grpcomm_base_module_barrier_fn_t)(void); +typedef int (*orte_grpcomm_base_module_barrier_fn_t)(orte_grpcomm_collective_t *coll); /** DATA EXCHANGE FUNCTIONS - SEE ompi/runtime/ompi_module_exchange.h FOR A DESCRIPTION @@ -86,12 +85,11 @@ typedef int (*orte_grpcomm_base_module_modex_get_proc_attr_fn_t)(const orte_proc void **buffer, size_t *size); /* perform a modex operation */ -typedef int (*orte_grpcomm_base_module_modex_fn_t)(opal_list_t *procs); +typedef int (*orte_grpcomm_base_module_modex_fn_t)(orte_grpcomm_collective_t *coll); /* purge the internal attr table */ typedef int (*orte_grpcomm_base_module_purge_proc_attrs_fn_t)(void); - /* * Ver 2.0 */ @@ -101,7 +99,6 @@ struct orte_grpcomm_base_module_2_0_0_t { /* collective operations */ orte_grpcomm_base_module_xcast_fn_t xcast; orte_grpcomm_base_module_allgather_fn_t allgather; - orte_grpcomm_base_module_allgather_list_fn_t allgather_list; orte_grpcomm_base_module_barrier_fn_t barrier; /* modex functions */ orte_grpcomm_base_module_modex_set_proc_attr_fn_t set_proc_attr; diff --git a/orte/mca/grpcomm/grpcomm_types.h 
b/orte/mca/grpcomm/grpcomm_types.h index bdfd5230b9..ddb8877665 100644 --- a/orte/mca/grpcomm/grpcomm_types.h +++ b/orte/mca/grpcomm/grpcomm_types.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,28 +43,79 @@ BEGIN_C_DECLS -/* - * Define routing modes +/* Define a collective callback function - this will + * be called upon completion of collective ops such + * as modex and barrier. */ -typedef uint8_t orte_grpcomm_mode_t; -#define ORTE_GRPCOMM_MODE_T OPAL_UINT8 +typedef void (*orte_grpcomm_collective_cbfunc_t)(opal_buffer_t *data, void *cbdata); -/* daemon N relays message to daemon N+1 */ -#define ORTE_GRPCOMM_CHAIN (orte_grpcomm_mode_t) 1 -/* binomial tree */ -#define ORTE_GRPCOMM_BINOMIAL (orte_grpcomm_mode_t) 2 -/* linear - HNP sends direct to all daemons */ -#define ORTE_GRPCOMM_LINEAR (orte_grpcomm_mode_t) 3 +/* forward define the struct */ +struct orte_grpcomm_collective_t; -/* - * Define collective types - */ -typedef uint8_t orte_grpcomm_coll_t; -#define ORTE_GRPCOMM_COLL_T OPAL_UINT8 +typedef int32_t orte_grpcomm_coll_id_t; +#define ORTE_GRPCOMM_COLL_ID_T OPAL_INT32 +#define ORTE_GRPCOMM_COLL_ID_REQ -1 -#define ORTE_GRPCOMM_COLL_NONE 0x00 -#define ORTE_GRPCOMM_BARRIER 0x01 -#define ORTE_GRPCOMM_ALLGATHER 0x02 +typedef int8_t orte_grpcomm_coll_t; +#define ORTE_GRPCOMM_XCAST 1 +#define ORTE_GRPCOMM_COLL_RELAY 2 +#define ORTE_GRPCOMM_COLL_COMPLETE 3 +#define ORTE_GRPCOMM_COLL_PEERS 4 + +typedef enum { + ORTE_GRPCOMM_INTERNAL_STG_LOCAL, + ORTE_GRPCOMM_INTERNAL_STG_GLOBAL, + ORTE_GRPCOMM_INTERNAL_STG_APP +} orte_grpcomm_internal_stage_t; + +/* structure for tracking collective operations */ +struct orte_grpcomm_collective_t { + opal_list_item_t super; + orte_grpcomm_coll_id_t id; + /* flag that user can poll 
on to know when collective + * has completed - set to false just prior to + * calling user callback function, if non-NULL + */ + bool active; + /* number of local contributors */ + orte_vpid_t num_local_recvd; + /* bucket to collect local contributions */ + opal_buffer_t local_bucket; + /* number of buckets collected from peers */ + orte_vpid_t num_peer_buckets; + /* total number of contributors */ + orte_vpid_t num_global_recvd; + /* flag to mark that the collective is locally complete - i.e., + * all local contributions have been recvd and the local + * data has been entered into the global collective + */ + bool locally_complete; + /* list of names of those participating in the collective - an + * entry with vpid=WILDCARD implies that all members of that + * job must participate in the collective + */ + opal_list_t participants; + /* user callback function to be executed when collective + * is completed + */ + orte_grpcomm_collective_cbfunc_t cbfunc; + void *cbdata; + /* buffer collecting data to be delivered to user */ + opal_buffer_t buffer; + /* list of names of procs to receive the next step + * in executing the collective - this is obtained from + * the routed framework to minimize hops + */ + opal_list_t targets; + /* some collectives wrap around and call internal + * steps before completing - e.g., modex. 
This + * points the collective to the next step in the procedure + */ + orte_grpcomm_collective_cbfunc_t next_cb; + void *next_cbdata; +}; +typedef struct orte_grpcomm_collective_t orte_grpcomm_collective_t; +ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_grpcomm_collective_t); END_C_DECLS diff --git a/orte/mca/sensor/heartbeat/.ompi_ignore b/orte/mca/grpcomm/hier/.ompi_ignore similarity index 100% rename from orte/mca/sensor/heartbeat/.ompi_ignore rename to orte/mca/grpcomm/hier/.ompi_ignore diff --git a/orte/mca/grpcomm/hier/grpcomm_hier_module.c b/orte/mca/grpcomm/hier/grpcomm_hier_module.c index 331f4e6da9..dd54a6173a 100644 --- a/orte/mca/grpcomm/hier/grpcomm_hier_module.c +++ b/orte/mca/grpcomm/hier/grpcomm_hier_module.c @@ -93,7 +93,6 @@ static int init(void) my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid; my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN); if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) { ORTE_ERROR_LOG(rc); @@ -268,7 +267,6 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) proc.jobid = ORTE_PROC_MY_NAME->jobid; for (v=0; v < orte_process_info.num_procs; v++) { proc.vpid = v; - ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); /* is this proc local_rank=0 on its node? */ if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { @@ -283,7 +281,6 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = proc.jobid; nm->name.vpid = proc.vpid; - ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); opal_list_append(&my_local_peers, &nm->item); /* if I am not local_rank=0, is this one? 
*/ @@ -291,7 +288,6 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) 0 == orte_ess.get_local_rank(&proc)) { my_local_rank_zero_proc.jobid = proc.jobid; my_local_rank_zero_proc.vpid = proc.vpid; - ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); } } diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c index 1bdc574069..2fbc444ee1 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c @@ -3,7 +3,7 @@ * Copyright (c) 2007 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. All + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All * rights reserved. * $COPYRIGHT$ * @@ -40,16 +40,14 @@ static void finalize(void); static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int pmi_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); -static int pmi_allgather_list(opal_list_t *names, - opal_buffer_t *sbuf, opal_buffer_t *rbuf); -static int pmi_barrier(void); +static int pmi_allgather(orte_grpcomm_collective_t *coll); +static int pmi_barrier(orte_grpcomm_collective_t *coll); static int pmi_set_proc_attr(const char* attr_name, const void *buffer, size_t size); static int pmi_get_proc_attr(const orte_process_name_t name, const char* attr_name, void **buffer, size_t *size); -static int modex(opal_list_t *procs); +static int modex(orte_grpcomm_collective_t *coll); static int purge_proc_attrs(void); /* Module def */ @@ -58,7 +56,6 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = { finalize, xcast, pmi_allgather, - pmi_allgather_list, pmi_barrier, pmi_set_proc_attr, pmi_get_proc_attr, @@ -165,7 +162,7 @@ static int xcast(orte_jobid_t job, return ORTE_ERR_NOT_SUPPORTED; } -static int pmi_barrier(void) +static int pmi_barrier(orte_grpcomm_collective_t *coll) { int rc; @@ -173,11 +170,15 
@@ static int pmi_barrier(void) "%s grpcomm:pmi entering barrier", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* if I am alone, just return */ + /* if I am alone, just execute the callback */ if (1 == orte_process_info.num_procs) { OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:pmi:barrier only one proc", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } @@ -198,25 +199,21 @@ static int pmi_barrier(void) OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi barrier complete", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* execute the callback */ + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } -static int pmi_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) +static int pmi_allgather(orte_grpcomm_collective_t *coll) { /* not used in this implementation */ return ORTE_ERR_NOT_SUPPORTED; } -static int pmi_allgather_list(opal_list_t *names, - opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - /* no idea how to do this - only occurs for comm_spawn, - * which this module doesn't support - */ - return ORTE_ERR_NOT_SUPPORTED; -} - static int pmi_set_proc_attr(const char* attr_name, const void *buffer, size_t size) { @@ -285,7 +282,7 @@ static int pmi_get_proc_attr(const orte_process_name_t name, } /*** MODEX SECTION ***/ -static int modex(opal_list_t *procs) +static int modex(orte_grpcomm_collective_t *coll) { int rc, i; size_t len; @@ -520,28 +517,17 @@ static int modex(opal_list_t *procs) rc = kvs_get(pmi_kvs_key, pmi_attr_val, pmi_vallen_max); /* don't error out here - if not found, that's okay */ if (PMI_SUCCESS == rc) { - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &name, ORTE_PROC_MY_NAME)) { + if (name.jobid == ORTE_PROC_MY_NAME->jobid && + name.vpid == ORTE_PROC_MY_NAME->vpid) { /* if this data is from myself, then set locality to all */ - 
OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale ALL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - pmap->locality = OPAL_PROC_ALL_LOCAL; + pmap->locality = OPAL_PROC_ALL_LOCAL; } else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) { /* this is on a different node, then mark as non-local */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale NONLOCAL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); pmap->locality = OPAL_PROC_NON_LOCAL; } else if (0 == strlen(pmi_attr_val)){ /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcomm:pmi setting proc %s locale NODE", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); pmap->locality = OPAL_PROC_ON_NODE; } else { bind_level = strtol(pmi_attr_val, NULL, 10); @@ -560,13 +546,13 @@ static int modex(opal_list_t *procs) orte_process_info.bind_level, orte_process_info.bind_idx, bind_level, bind_idx); - OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, - "%s grpcommpmi setting proc %s locale %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), - opal_hwloc_base_print_locality(pmap->locality))); } } + OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, + "%s grpcomm:pmi setting proc %s locale %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&name), + opal_hwloc_base_print_locality(pmap->locality))); } } #endif @@ -575,7 +561,12 @@ static int modex(opal_list_t *procs) OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:pmi: modex completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - + + /* execute the callback */ + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return rc; } diff --git a/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c 
b/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c index 6fc4521341..0132036bc8 100644 --- a/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c +++ b/orte/mca/grpcomm/portals4_shmem/grpcomm_portals4_shmem_module.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All + * rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,11 +48,9 @@ static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); -static int orte_grpcomm_portals4_shmem_barrier(void); +static int orte_grpcomm_portals4_shmem_barrier(orte_grpcomm_collective_t *coll); -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf); - -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf); +static int allgather(orte_grpcomm_collective_t *coll); static int set_proc_attr(const char *attr_name, const void *data, @@ -60,7 +60,7 @@ static int get_proc_attr(const orte_process_name_t proc, const char * attribute_name, void **val, size_t *size); -static int modex(opal_list_t *procs); +static int modex(orte_grpcomm_collective_t *coll); static int purge_proc_attrs(void); @@ -69,7 +69,6 @@ orte_grpcomm_base_module_t orte_grpcomm_portals4_shmem_module = { finalize, xcast, allgather, - allgather_list, orte_grpcomm_portals4_shmem_barrier, set_proc_attr, get_proc_attr, @@ -113,35 +112,33 @@ static int xcast(orte_jobid_t job, } static int -orte_grpcomm_portals4_shmem_barrier(void) +orte_grpcomm_portals4_shmem_barrier(orte_grpcomm_collective_t *coll) { runtime_barrier(); - + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } -static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) +static int allgather(orte_grpcomm_collective_t *coll) { int rc; orte_std_cntr_t zero=0; - 
- /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return rc; -} + opal_buffer_t rbuf; -static int allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf) -{ - int rc; - orte_std_cntr_t zero=0; - - /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &zero, 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; + coll->active = false; + if (NULL != coll->cbfunc) { + /* seed the outgoing buffer with num_procs=0 so it won't be unpacked */ + OBJ_CONSTRUCT(&rbuf, opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(&rbuf, &zero, 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&rbuf); + return rc; + } + coll->cbfunc(&rbuf, coll->cbdata); + OBJ_DESTRUCT(&rbuf); } return rc; } @@ -186,8 +183,12 @@ static int get_proc_attr(const orte_process_name_t proc, return ORTE_ERR_NOT_IMPLEMENTED; } -static int modex(opal_list_t *procs) +static int modex(orte_grpcomm_collective_t *coll) { + coll->active = false; + if (NULL != coll->cbfunc) { + coll->cbfunc(NULL, coll->cbdata); + } return ORTE_SUCCESS; } diff --git a/orte/mca/iof/base/base.h b/orte/mca/iof/base/base.h index 248def66fc..5b92eb3a29 100644 --- a/orte/mca/iof/base/base.h +++ b/orte/mca/iof/base/base.h @@ -64,7 +64,7 @@ ORTE_DECLSPEC int orte_iof_base_open(void); typedef struct { opal_list_item_t super; bool pending; - opal_event_t ev; + opal_event_t *ev; int fd; opal_list_t outputs; } orte_iof_write_event_t; @@ -86,7 +86,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_sink_t); typedef struct { opal_object_t super; orte_process_name_t name; - opal_event_t ev; + opal_event_t *ev; int fd; orte_iof_tag_t tag; bool active; @@ -135,12 +135,11 @@ typedef struct orte_iof_base_t orte_iof_base_t; ep = OBJ_NEW(orte_iof_sink_t); \ ep->name.jobid = (nm)->jobid; \ ep->name.vpid = (nm)->vpid; 
\ - ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ ep->tag = (tg); \ if (0 <= (fid)) { \ ep->wev->fd = (fid); \ - opal_event_set(opal_event_base, \ - &(ep->wev->ev), ep->wev->fd, \ + opal_event_set(orte_event_base, \ + ep->wev->ev, ep->wev->fd, \ OPAL_EV_WRITE, \ wrthndlr, ep); \ } \ @@ -169,19 +168,18 @@ typedef struct orte_iof_base_t orte_iof_base_t; rev = OBJ_NEW(orte_iof_read_event_t); \ rev->name.jobid = (nm)->jobid; \ rev->name.vpid = (nm)->vpid; \ - ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ rev->tag = (tg); \ rev->fd = (fid); \ *(rv) = rev; \ rev->file = strdup(__FILE__); \ rev->line = __LINE__; \ - opal_event_set(opal_event_base, \ - &rev->ev, (fid), \ + opal_event_set(orte_event_base, \ + rev->ev, (fid), \ OPAL_EV_READ, \ (cbfunc), rev); \ if ((actv)) { \ rev->active = true; \ - opal_event_add(&rev->ev, 0); \ + opal_event_add(rev->ev, 0); \ } \ } while(0); @@ -194,12 +192,11 @@ typedef struct orte_iof_base_t orte_iof_base_t; ep = OBJ_NEW(orte_iof_sink_t); \ ep->name.jobid = (nm)->jobid; \ ep->name.vpid = (nm)->vpid; \ - ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \ ep->tag = (tg); \ if (0 <= (fid)) { \ ep->wev->fd = (fid); \ - opal_event_set(opal_event_base, \ - &(ep->wev->ev), ep->wev->fd, \ + opal_event_set(orte_event_base, \ + ep->wev->ev, ep->wev->fd, \ OPAL_EV_WRITE, \ wrthndlr, ep); \ } \ @@ -215,17 +212,16 @@ typedef struct orte_iof_base_t orte_iof_base_t; rev = OBJ_NEW(orte_iof_read_event_t); \ rev->name.jobid = (nm)->jobid; \ rev->name.vpid = (nm)->vpid; \ - ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \ rev->tag = (tg); \ rev->fd = (fid); \ *(rv) = rev; \ - opal_event_set(opal_event_base, \ - &rev->ev, (fid), \ + opal_event_set(orte_event_base, \ + rev->ev, (fid), \ OPAL_EV_READ, \ (cbfunc), rev); \ if ((actv)) { \ rev->active = true; \ - opal_event_add(&rev->ev, 0); \ + opal_event_add(rev->ev, 0); \ } \ } while(0); diff --git a/orte/mca/iof/base/iof_base_close.c b/orte/mca/iof/base/iof_base_close.c index a67abe68e9..4ff07f8075 100644 --- 
a/orte/mca/iof/base/iof_base_close.c +++ b/orte/mca/iof/base/iof_base_close.c @@ -42,9 +42,6 @@ int orte_iof_base_close(void) } OBJ_DESTRUCT(&orte_iof_base.iof_components_opened); - OBJ_DESTRUCT(&orte_iof_base.iof_write_output_lock); - - return ORTE_SUCCESS; } diff --git a/orte/mca/iof/base/iof_base_open.c b/orte/mca/iof/base/iof_base_open.c index b3a20fd03f..1e929f56b1 100644 --- a/orte/mca/iof/base/iof_base_open.c +++ b/orte/mca/iof/base/iof_base_open.c @@ -91,7 +91,6 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr) { ptr->daemon.jobid = ORTE_JOBID_INVALID; ptr->daemon.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN); ptr->wev = OBJ_NEW(orte_iof_write_event_t); } static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr) @@ -114,10 +113,11 @@ static void orte_iof_base_read_event_construct(orte_iof_read_event_t* rev) { rev->fd = -1; rev->active = false; + rev->ev = opal_event_alloc(); } static void orte_iof_base_read_event_destruct(orte_iof_read_event_t* rev) { - opal_event_del(&rev->ev); + opal_event_free(rev->ev); if (0 <= rev->fd) { OPAL_OUTPUT_VERBOSE((20, orte_iof_base.iof_output, "%s iof: closing fd %d for process %s", @@ -137,12 +137,11 @@ static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev) wev->pending = false; wev->fd = -1; OBJ_CONSTRUCT(&wev->outputs, opal_list_t); + wev->ev = opal_event_alloc(); } static void orte_iof_base_write_event_destruct(orte_iof_write_event_t* wev) { - if (wev->pending) { - opal_event_del(&wev->ev); - } + opal_event_free(wev->ev); if (ORTE_PROC_IS_HNP) { int xmlfd = fileno(orte_xml_fp); if (xmlfd == wev->fd) { diff --git a/orte/mca/iof/base/iof_base_output.c b/orte/mca/iof/base/iof_base_output.c index 554d32683a..f18ec7d0a1 100644 --- a/orte/mca/iof/base/iof_base_output.c +++ b/orte/mca/iof/base/iof_base_output.c @@ -266,7 +266,7 @@ process: OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s write:output adding write event", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - opal_event_add(&channel->ev, 0); + opal_event_add(channel->ev, 0); channel->pending = true; } @@ -322,7 +322,7 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata) OBJ_RELEASE(output); } ABORT: - opal_event_del(&wev->ev); + opal_event_del(wev->ev); wev->pending = false; DEPART: diff --git a/orte/mca/iof/hnp/iof_hnp.c b/orte/mca/iof/hnp/iof_hnp.c index d07a4d1b08..9e34d1959c 100644 --- a/orte/mca/iof/hnp/iof_hnp.c +++ b/orte/mca/iof/hnp/iof_hnp.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -144,9 +146,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int flags; char *outfile; int fdout; - orte_odls_job_t *jobdat=NULL; int np, numdigs; - int rc; orte_ns_cmp_bitmask_t mask; /* don't do this if the dst vpid is invalid or the fd is negative! 
*/ @@ -185,24 +185,15 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, proct = OBJ_NEW(orte_iof_proc_t); proct->name.jobid = dst_name->jobid; proct->name.vpid = dst_name->vpid; - ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); opal_list_append(&mca_iof_hnp_component.procs, &proct->super); /* see if we are to output to a file */ if (NULL != orte_output_filename) { - /* get the local jobdata for this proc */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - if (jobdat->jobid == proct->name.jobid) { - break; - } - } - if (NULL == jobdat) { + /* get the jobdata for this proc */ + if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - np = jobdat->num_procs / 10; + np = jdata->num_procs / 10; /* determine the number of digits required for max vpid */ numdigs = 1; while (np > 0) { @@ -246,11 +237,11 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, */ if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { proct->revstdout->active = true; - opal_event_add(&(proct->revstdout->ev), 0); + opal_event_add(proct->revstdout->ev, 0); proct->revstderr->active = true; - opal_event_add(&(proct->revstderr->ev), 0); + opal_event_add(proct->revstderr->ev, 0); proct->revstddiag->active = true; - opal_event_add(&(proct->revstddiag->ev), 0); + opal_event_add(proct->revstddiag->ev, 0); } return ORTE_SUCCESS; } @@ -282,7 +273,6 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, &mca_iof_hnp_component.sinks); sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; sink->daemon.vpid = proc->node->daemon->name.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon)); } } @@ -315,7 +305,7 @@ static int hnp_push(const orte_process_name_t* 
dst_name, orte_iof_tag_t src_tag, * filedescriptor is not a tty, don't worry about it * and always stay connected. */ - opal_event_signal_set(opal_event_base, &mca_iof_hnp_component.stdinsig, + opal_event_signal_set(orte_event_base, &mca_iof_hnp_component.stdinsig, SIGCONT, orte_iof_hnp_stdin_cb, NULL); @@ -334,9 +324,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, */ if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) { mca_iof_hnp_component.stdinev->active = true; - if (OPAL_SUCCESS != (rc = opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0))) { - ORTE_ERROR_LOG(rc); - } + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } } else { /* if we are not looking at a tty, just setup a read event @@ -389,7 +377,6 @@ static int hnp_pull(const orte_process_name_t* dst_name, &mca_iof_hnp_component.sinks); sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid; sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch); return ORTE_SUCCESS; } @@ -436,9 +423,6 @@ static int finalize(void) int num_written; bool dump; - OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock); - - OPAL_THREAD_LOCK(&orte_iof_base.iof_write_output_lock); /* check if anything is still trying to be written out */ wev = orte_iof_base.iof_write_stdout->wev; if (!opal_list_is_empty(&wev->outputs)) { @@ -456,7 +440,6 @@ static int finalize(void) OBJ_RELEASE(output); } } - OBJ_RELEASE(orte_iof_base.iof_write_stdout); if (!orte_xml_output) { /* we only opened stderr channel if we are NOT doing xml output */ wev = orte_iof_base.iof_write_stderr->wev; @@ -475,31 +458,10 @@ static int finalize(void) OBJ_RELEASE(output); } } - OBJ_RELEASE(orte_iof_base.iof_write_stderr); } - OPAL_THREAD_UNLOCK(&orte_iof_base.iof_write_output_lock); - /* if the stdin event is active, delete it */ - if (NULL != mca_iof_hnp_component.stdinev) { - OBJ_RELEASE(mca_iof_hnp_component.stdinev); - 
opal_event_signal_del(&mca_iof_hnp_component.stdinsig); - } - /* cleanout all registered sinks */ - while ((item = opal_list_remove_first(&mca_iof_hnp_component.sinks)) != NULL) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&mca_iof_hnp_component.sinks); - /* cleanout all pending proc objects holding receive events */ - while ((item = opal_list_remove_first(&mca_iof_hnp_component.procs)) != NULL) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&mca_iof_hnp_component.procs); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP); - /* release and cleanup the lock */ - OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); - OBJ_DESTRUCT(&mca_iof_hnp_component.lock); - return ORTE_SUCCESS; } @@ -564,7 +526,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. */ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } /* otherwise, something bad happened so all we can do is declare an @@ -589,7 +551,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. */ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } OBJ_RELEASE(output); @@ -616,7 +578,7 @@ CHECK: OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "restarting read event")); mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } } diff --git a/orte/mca/iof/hnp/iof_hnp_read.c b/orte/mca/iof/hnp/iof_hnp_read.c index 2e94e38725..c98833ed3d 100644 --- a/orte/mca/iof/hnp/iof_hnp_read.c +++ b/orte/mca/iof/hnp/iof_hnp_read.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -30,12 +32,12 @@ #include "opal/dss/dss.h" -#include "orte/mca/rml/rml_types.h" +#include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" -#include "orte/orted/orted.h" #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/base.h" @@ -44,10 +46,18 @@ static void restart_stdin(int fd, short event, void *cbdata) { + orte_timer_t *tm = (orte_timer_t*)cbdata; + if (NULL != mca_iof_hnp_component.stdinev && - !orte_job_term_ordered) { + !orte_job_term_ordered && + !mca_iof_hnp_component.stdinev->active) { mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); + } + + /* if this was a timer callback, then release the timer */ + if (NULL != tm) { + OBJ_RELEASE(tm); } } @@ -70,9 +80,9 @@ void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata) if (should_process) { mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } else { - opal_event_del(&(mca_iof_hnp_component.stdinev->ev)); + opal_event_del(mca_iof_hnp_component.stdinev->ev); mca_iof_hnp_component.stdinev->active = false; } } @@ -109,7 +119,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) /* non-blocking, retry */ if (EAGAIN == errno || EINTR == errno) { - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); return; } @@ -207,7 +217,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) restart_stdin(fd, 0, NULL); } else { /* delay for awhile and then restart */ - ORTE_TIMER_EVENT(0, 10000, restart_stdin); + ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI); } } /* nothing more to do 
*/ @@ -275,24 +285,9 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) if (NULL == proct->revstdout && NULL == proct->revstderr && NULL == proct->revstddiag) { - opal_buffer_t cmdbuf; - orte_daemon_cmd_flag_t command; /* this proc's iof is complete */ opal_list_remove_item(&mca_iof_hnp_component.procs, item); - /* setup a cmd to notify that the iof is complete */ - OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t); - command = ORTE_DAEMON_IOF_COMPLETE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - CLEANUP: - OBJ_DESTRUCT(&cmdbuf); + ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE); OBJ_RELEASE(proct); } break; @@ -337,8 +332,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata) } /* re-add the event */ - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); - OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); + OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock); return; } diff --git a/orte/mca/iof/hnp/iof_hnp_receive.c b/orte/mca/iof/hnp/iof_hnp_receive.c index 355c7ca2b5..d2e84ea4af 100644 --- a/orte/mca/iof/hnp/iof_hnp_receive.c +++ b/orte/mca/iof/hnp/iof_hnp_receive.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -36,7 +38,6 @@ #endif #include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -47,9 +48,10 @@ #include "iof_hnp.h" -static void process_msg(int fd, short event, void *cbdata) +void orte_iof_hnp_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; orte_process_name_t origin; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; @@ -61,7 +63,7 @@ static void process_msg(int fd, short event, void *cbdata) /* unpack the stream first as this may be flow control info */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &stream, &count, ORTE_IOF_TAG))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -72,14 +74,14 @@ static void process_msg(int fd, short event, void *cbdata) !orte_job_term_ordered && !mca_iof_hnp_component.stdinev->active) { mca_iof_hnp_component.stdinev->active = true; - opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); + opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } goto CLEAN_RETURN; } else if (ORTE_IOF_XOFF & stream) { /* stop the stdin read event */ if (NULL != mca_iof_hnp_component.stdinev && !mca_iof_hnp_component.stdinev->active) { - opal_event_del(&(mca_iof_hnp_component.stdinev->ev)); + opal_event_del(mca_iof_hnp_component.stdinev->ev); mca_iof_hnp_component.stdinev->active = false; } goto CLEAN_RETURN; @@ -87,7 +89,7 @@ static void process_msg(int fd, short event, void *cbdata) /* get name of the process whose io we are discussing */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &origin, &count, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) { 
ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -97,7 +99,7 @@ static void process_msg(int fd, short event, void *cbdata) OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received pull cmd from remote tool %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender), + ORTE_NAME_PRINT(sender), ORTE_NAME_PRINT(&origin))); /* a tool is requesting that we send it a copy of the specified stream(s) * from the specified process(es), so create a sink for it @@ -105,23 +107,20 @@ static void process_msg(int fd, short event, void *cbdata) if (ORTE_IOF_STDOUT & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDOUT, NULL, &mca_iof_hnp_component.sinks); - sink->daemon.jobid = mev->sender.jobid; - sink->daemon.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); + sink->daemon.jobid = sender->jobid; + sink->daemon.vpid = sender->vpid; } if (ORTE_IOF_STDERR & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, NULL, &mca_iof_hnp_component.sinks); - sink->daemon.jobid = mev->sender.jobid; - sink->daemon.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); + sink->daemon.jobid = sender->jobid; + sink->daemon.vpid = sender->vpid; } if (ORTE_IOF_STDDIAG & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, NULL, &mca_iof_hnp_component.sinks); - sink->daemon.jobid = mev->sender.jobid; - sink->daemon.vpid = mev->sender.vpid; - ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch); + sink->daemon.jobid = sender->jobid; + sink->daemon.vpid = sender->vpid; } goto CLEAN_RETURN; } @@ -130,7 +129,7 @@ static void process_msg(int fd, short event, void *cbdata) OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received close cmd from remote tool %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender), + ORTE_NAME_PRINT(sender), ORTE_NAME_PRINT(&origin))); /* a tool is requesting that we no longer forward a copy of the * specified 
stream(s) from the specified process(es) - remove the sink @@ -163,7 +162,7 @@ static void process_msg(int fd, short event, void *cbdata) /* this must have come from a daemon forwarding output - unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, data, &numbytes, OPAL_BYTE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -201,30 +200,5 @@ static void process_msg(int fd, short event, void *cbdata) } CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); - return; -} - -void orte_iof_hnp_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((5, orte_iof_base.iof_output, - "%s iof:hnp:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message to avoid performing the rest of the job while - * inside this receive! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - return; } diff --git a/orte/mca/iof/orted/iof_orted.c b/orte/mca/iof/orted/iof_orted.c index 4c7d865c9f..dae7fef958 100644 --- a/orte/mca/iof/orted/iof_orted.c +++ b/orte/mca/iof/orted/iof_orted.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -124,7 +126,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta orte_iof_sink_t *sink; char *outfile; int fdout; - orte_odls_job_t *jobdat=NULL; + orte_job_t *jobdat=NULL; int np, numdigs; orte_ns_cmp_bitmask_t mask; @@ -161,20 +163,11 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta proct = OBJ_NEW(orte_iof_proc_t); proct->name.jobid = dst_name->jobid; proct->name.vpid = dst_name->vpid; - ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch); opal_list_append(&mca_iof_orted_component.procs, &proct->super); /* see if we are to output to a file */ if (NULL != orte_output_filename) { /* get the local jobdata for this proc */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - if (jobdat->jobid == proct->name.jobid) { - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(proct->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } @@ -222,11 +215,11 @@ SETUP: */ if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) { proct->revstdout->active = true; - opal_event_add(&(proct->revstdout->ev), 0); + opal_event_add(proct->revstdout->ev, 0); proct->revstderr->active = true; - opal_event_add(&(proct->revstderr->ev), 0); + opal_event_add(proct->revstderr->ev, 0); proct->revstddiag->active = true; - opal_event_add(&(proct->revstddiag->ev), 0); + opal_event_add(proct->revstddiag->ev, 0); } return ORTE_SUCCESS; } @@ -389,7 +382,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. 
*/ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } /* otherwise, something bad happened so all we can do is declare an @@ -419,7 +412,7 @@ static void stdin_write_handler(int fd, short event, void *cbdata) * when the fd is ready. */ wev->pending = true; - opal_event_add(&wev->ev, 0); + opal_event_add(wev->ev, 0); goto CHECK; } OBJ_RELEASE(output); diff --git a/orte/mca/iof/orted/iof_orted_read.c b/orte/mca/iof/orted/iof_orted_read.c index 5edeeafcf5..fd95766d69 100644 --- a/orte/mca/iof/orted/iof_orted_read.c +++ b/orte/mca/iof/orted/iof_orted_read.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,12 +33,11 @@ #include "opal/dss/dss.h" #include "orte/mca/rml/rml.h" -#include "orte/mca/rml/rml_types.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" -#include "orte/orted/orted.h" #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/base.h" @@ -65,7 +66,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) opal_list_item_t *item; orte_iof_proc_t *proct; orte_ns_cmp_bitmask_t mask; - + OPAL_THREAD_LOCK(&mca_iof_orted_component.lock); /* read up to the fragment size */ @@ -90,7 +91,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) /* either we have a connection error or it was a non-blocking read */ if (EAGAIN == errno || EINTR == errno) { /* non-blocking, retry */ - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock); return; } @@ -164,14 +165,14 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata) 
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP, 0, send_cb, NULL); -RESTART: + RESTART: /* re-add the event */ - opal_event_add(&rev->ev, 0); + opal_event_add(rev->ev, 0); OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock); return; -CLEAN_RETURN: + CLEAN_RETURN: /* must be an error, or zero bytes were read indicating that the * proc terminated this IOF channel - either way, find this proc * on our list and clean up @@ -202,24 +203,9 @@ CLEAN_RETURN: if (NULL == proct->revstdout && NULL == proct->revstderr && NULL == proct->revstddiag) { - opal_buffer_t cmdbuf; - orte_daemon_cmd_flag_t command; /* this proc's iof is complete */ opal_list_remove_item(&mca_iof_orted_component.procs, item); - /* setup a cmd to notify that the iof is complete */ - OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t); - command = ORTE_DAEMON_IOF_COMPLETE; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - CLEANUP: - OBJ_DESTRUCT(&cmdbuf); + ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE); OBJ_RELEASE(proct); } break; diff --git a/orte/mca/iof/orted/iof_orted_receive.c b/orte/mca/iof/orted/iof_orted_receive.c index 7585b951a0..7ecae6e16d 100644 --- a/orte/mca/iof/orted/iof_orted_receive.c +++ b/orte/mca/iof/orted/iof_orted_receive.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -83,9 +85,10 @@ void orte_iof_orted_send_xonxoff(orte_iof_tag_t tag) * * (b) flow control messages */ -static void process_msg(int fd, short event, void *cbdata) +void orte_iof_orted_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; int32_t count, numbytes; @@ -95,7 +98,7 @@ static void process_msg(int fd, short event, void *cbdata) /* see what stream generated this data */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &stream, &count, ORTE_IOF_TAG))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -108,14 +111,14 @@ static void process_msg(int fd, short event, void *cbdata) /* unpack the intended target */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &target, &count, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &target, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } /* unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, data, &numbytes, OPAL_BYTE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -163,31 +166,5 @@ static void process_msg(int fd, short event, void *cbdata) } CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); - return; -} - - -void orte_iof_orted_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, - "%s iof:orted:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * 
we process the message to avoid performing the rest of the job while - * inside this receive! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - return; } diff --git a/orte/mca/iof/tool/iof_tool.c b/orte/mca/iof/tool/iof_tool.c index ea62d6e679..b170d13ac4 100644 --- a/orte/mca/iof/tool/iof_tool.c +++ b/orte/mca/iof/tool/iof_tool.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -73,7 +75,7 @@ static int init(void) from the HNP IOF component */ if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_PROXY, - ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_iof_tool_recv, NULL))) { ORTE_ERROR_LOG(rc); @@ -217,9 +219,6 @@ static int tool_close(const orte_process_name_t* src_name, orte_rml.send_buffer_nb(&hnp, buf, ORTE_RML_TAG_IOF_HNP, 0, send_cb, NULL); - /* wait right here until the close is confirmed */ - ORTE_PROGRESSED_WAIT(mca_iof_tool_component.closed, 0, 1); - return ORTE_SUCCESS; } diff --git a/orte/mca/iof/tool/iof_tool_receive.c b/orte/mca/iof/tool/iof_tool_receive.c index a83d4b8586..95019e3703 100644 --- a/orte/mca/iof/tool/iof_tool_receive.c +++ b/orte/mca/iof/tool/iof_tool_receive.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -40,9 +42,10 @@ #include "iof_tool.h" -static void process_msg(int fd, short event, void *cbdata) +void orte_iof_tool_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; orte_process_name_t origin; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; @@ -52,7 +55,7 @@ static void process_msg(int fd, short event, void *cbdata) /* unpack the stream first as this may be flow control info */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &stream, &count, ORTE_IOF_TAG))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -64,21 +67,21 @@ static void process_msg(int fd, short event, void *cbdata) OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received CLOSE handshake from remote hnp %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender))); + ORTE_NAME_PRINT(sender))); mca_iof_tool_component.closed = true; goto CLEAN_RETURN; } /* get name of the process whose io we are receiving */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &origin, &count, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } /* unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, data, &numbytes, OPAL_BYTE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -102,40 +105,5 @@ static void process_msg(int fd, short event, void *cbdata) } CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); - return; -} - -void orte_iof_tool_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; 
- - OPAL_OUTPUT_VERBOSE((5, orte_iof_base.iof_output, - "%s iof:tool:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message to avoid performing the rest of the job while - * inside this receive! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - - /* reissue the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_IOF_PROXY, - ORTE_RML_NON_PERSISTENT, - orte_iof_tool_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } return; } diff --git a/orte/mca/notifier/base/notifier_base_select.c b/orte/mca/notifier/base/notifier_base_select.c index 3d3f725e80..8f0286e484 100644 --- a/orte/mca/notifier/base/notifier_base_select.c +++ b/orte/mca/notifier/base/notifier_base_select.c @@ -26,7 +26,6 @@ #include "opal/mca/mca.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/mca/base/base.h" #include "opal/util/output.h" @@ -44,7 +43,6 @@ bool orte_notifier_base_help_selected = false; bool orte_notifier_base_log_peer_selected = false; bool orte_notifier_base_log_event_selected = false; -static opal_sos_reporter_callback_fn_t prev_reporter_callback; static inline char **orte_notifier_get_include_list(const char *, const char *, char **); @@ -207,8 +205,8 @@ int orte_notifier_base_select(void) if (NULL != nmodule->init) { /* If the module doesn't want to be used, skip it */ if (ORTE_SUCCESS != (ret = nmodule->init()) ) { - if (ORTE_ERR_NOT_SUPPORTED != OPAL_SOS_GET_ERROR_CODE(ret) && - ORTE_ERR_NOT_IMPLEMENTED != OPAL_SOS_GET_ERROR_CODE(ret)) { + if 
(ORTE_ERR_NOT_SUPPORTED != ret && + ORTE_ERR_NOT_IMPLEMENTED != ret) { exit_status = ret; goto cleanup; } @@ -293,11 +291,6 @@ int orte_notifier_base_select(void) orte_notifier_base_events_init(); } - /* Register a callback with OPAL SOS so that we can intercept - * error messages */ - opal_sos_reg_reporter_callback((opal_sos_reporter_callback_fn_t) orte_notifier_log, - &prev_reporter_callback); - cleanup: return exit_status; } diff --git a/orte/mca/notifier/hnp/Makefile.am b/orte/mca/notifier/hnp/Makefile.am index 91fba4047b..5f6fa2b22b 100644 --- a/orte/mca/notifier/hnp/Makefile.am +++ b/orte/mca/notifier/hnp/Makefile.am @@ -10,6 +10,8 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -17,6 +19,8 @@ # $HEADER$ # +EXTRA_DIST = orte_notifier_hnp.txt + sources = \ notifier_hnp.h \ notifier_hnp_module.c \ diff --git a/orte/mca/notifier/hnp/notifier_hnp.h b/orte/mca/notifier/hnp/notifier_hnp.h index fa683919d9..30c5879910 100644 --- a/orte/mca/notifier/hnp/notifier_hnp.h +++ b/orte/mca/notifier/hnp/notifier_hnp.h @@ -33,10 +33,6 @@ BEGIN_C_DECLS void orte_notifier_hnp_recv_cb(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); -#if OPAL_ENABLE_DEBUG -void orte_notifier_hnp_exception_cb(const orte_process_name_t* peer, - orte_rml_exception_t reason); -#endif /* extern opal_pointer_array_t orte_notifier_hnp_tables; diff --git a/orte/mca/notifier/hnp/notifier_hnp_module.c b/orte/mca/notifier/hnp/notifier_hnp_module.c index 6b3cc6e834..8c4e9e026e 100644 --- a/orte/mca/notifier/hnp/notifier_hnp_module.c +++ b/orte/mca/notifier/hnp/notifier_hnp_module.c @@ -11,7 +11,9 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. 
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ * * Additional copyrights may follow * @@ -33,7 +35,6 @@ #endif #include "opal/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/dss/dss.h" #include "opal/dss/dss_types.h" @@ -111,134 +112,6 @@ static int send_command(orte_notifier_base_severity_t severity, int errcode, return ORTE_SUCCESS; } -#if 0 -/** - * Function to pack a single SOS error entry. - * - * @return OPAL_SUCCESS Upon success - */ -static int opal_dss_pack_sos_error(opal_buffer_t *buf, opal_sos_error_t *error) -{ - int rc; - if (NULL == error) { - return ORTE_ERROR; - } - - /* Pack errnum */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->errnum, 1, OPAL_INT))) { - return rc; - } - - /* Pack the file name in which the error occurred */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, error->file, 1, OPAL_STRING))) { - return rc; - } - - /* Pack the line number on which the error was encountered */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->line, 1, OPAL_INT))) { - return rc; - } - - /* Pack the function name (if any) */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, error->func, 1, OPAL_STRING))) { - return rc; - } - - /* Pack the error message */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, error->msg, 1, OPAL_STRING))) { - return rc; - } - - /* Pack the pointer to the previous opal sos error object in the - opal sos table */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->prev, 1, OPAL_INT))) { - return rc; - } - - /* Pack the pointer to the next error */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &error->next, 1, OPAL_INT))) { - return rc; - } - - return ORTE_SUCCESS; -} - -/** - * Function to pack all the entries in the SOS table and send it - * over to the HNP. 
- * - * @return OPAL_SUCCESS Upon success - * @return OPAL_FAILURE Upon failure - * - * ADK: Presently, we simply rely on orte_show_help to do the aggregation on - * a per-error basis. - */ -static int opal_sos_send_table(void) -{ - opal_sos_error_t *opal_error; - opal_buffer_t *buf; - uint32_t key; - int rc; - size_t table_size; - void *prev_error, *next_error; - next_error = NULL; - - buf = OBJ_NEW(opal_buffer_t); - if (NULL == buf) { - return ORTE_ERR_OUT_OF_RESOURCE; - } - - OPAL_THREAD_LOCK(&opal_sos_table_lock); - table_size = opal_hash_table_get_size(&opal_sos_table); - - /* Pack the size of the SOS error table */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &table_size, 1, OPAL_SIZE))) { - ORTE_ERROR_LOG(rc); - goto error; - } - - if (OPAL_SUCCESS != opal_hash_table_get_first_key_uint32(&opal_sos_table, - &key, (void**)&opal_error, - &prev_error)) { - rc = ORTE_ERROR; - goto error; - } - - /* Pack the sos error object */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_sos_error(buf, opal_error))) { - ORTE_ERROR_LOG(rc); - goto error; - } - - while (OPAL_SUCCESS == opal_hash_table_get_next_key_uint32(&opal_sos_table, - &key, (void**)&opal_error, - &prev_error, &next_error)) - { - if (ORTE_SUCCESS != (rc = opal_dss_pack_sos_error(buf, opal_error))) { - ORTE_ERROR_LOG(rc); - goto error; - } - } - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - - /* Now send the buffer (rc = number of bytes sent) */ - rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buf, - ORTE_RML_TAG_NOTIFIER_HNP, 0); - if (rc <= 0) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - return rc; - } - - return ORTE_SUCCESS; - -error: - OPAL_THREAD_UNLOCK(&opal_sos_table_lock); - OBJ_RELEASE(buf); - return rc; -} -#endif - static int init(void) { int rc; @@ -248,23 +121,12 @@ static int init(void) if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFIER_HNP, - ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_notifier_hnp_recv_cb, NULL))) { ORTE_ERROR_LOG(rc); return rc; } - 
-#if OPAL_ENABLE_DEBUG - /* If we're debugging, also add an exception handler -- just to - watch for problems in the RML */ - if (ORTE_SUCCESS != - (rc = orte_rml.add_exception_handler(orte_notifier_hnp_exception_cb))) { - ORTE_ERROR_LOG(rc); - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFIER_HNP); - return rc; - } -#endif } return ORTE_SUCCESS; @@ -289,7 +151,7 @@ static void mylog(orte_notifier_base_severity_t severity, int errcode, if (NULL != output) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, output); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, output); } else { send_command(severity, errcode, output); } @@ -307,7 +169,7 @@ static void myhelplog(orte_notifier_base_severity_t severity, int errcode, if (NULL != output) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, output); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, output); } else { send_command(severity, errcode, output); } @@ -324,7 +186,7 @@ static void mypeerlog(orte_notifier_base_severity_t severity, int errcode, if (NULL != buf) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, buf); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, buf); } else { send_command(severity, errcode, buf); } @@ -336,7 +198,7 @@ static void myeventlog(const char *msg) { if (ORTE_PROC_IS_HNP) { /* output it locally */ - orte_show_help("opal_sos_reporter.txt", "notifier message", false, (char*)msg); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, (char*)msg); } else { send_command(ORTE_NOTIFIER_NOTICE, ORTE_SUCCESS, (char *)msg); } diff --git a/orte/mca/notifier/hnp/notifier_hnp_recv.c b/orte/mca/notifier/hnp/notifier_hnp_recv.c index 6a770f84cc..c4f640c08a 100644 --- a/orte/mca/notifier/hnp/notifier_hnp_recv.c +++ 
b/orte/mca/notifier/hnp/notifier_hnp_recv.c @@ -11,7 +11,9 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ * * Additional copyrights may follow * @@ -25,18 +27,14 @@ #include "orte/mca/notifier/base/base.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/show_help.h" -#include "opal/util/opal_sos.h" #include "opal/class/opal_hash_table.h" #include "notifier_hnp.h" -/* - * This function is called back *after* the RML receive callback to - * avoid the RRD ("receive recursion of death"). - */ -static void process_msg(int fd, short event, void *cbdata) +void orte_notifier_hnp_recv_cb(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)cbdata; uint8_t u8; uint32_t u32; int rc, count; @@ -47,7 +45,7 @@ static void process_msg(int fd, short event, void *cbdata) /* Unpack the severity */ count = 1; if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &u8, &count, OPAL_UINT8))) { + (rc = opal_dss.unpack(buffer, &u8, &count, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -56,7 +54,7 @@ static void process_msg(int fd, short event, void *cbdata) /* Unpack the errcode */ count = 1; if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &u32, &count, OPAL_UINT32))) { + (rc = opal_dss.unpack(buffer, &u32, &count, OPAL_UINT32))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } @@ -65,207 +63,14 @@ static void process_msg(int fd, short event, void *cbdata) /* Unpack the string */ count = 1; if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &msg, &count, OPAL_STRING))) { + (rc = opal_dss.unpack(buffer, &msg, &count, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } - orte_show_help("opal_sos_reporter.txt", "notifier 
message", false, msg); + orte_show_help("orte_notifier_hnp.txt", "notifier message", false, msg); CLEAN_RETURN: - /* release the message event */ - OBJ_RELEASE(mev); return; } -#if 0 -/** - * Function to unpack a single SOS error entry. - * - * @return OPAL_SUCCESS Upon success - */ -static int opal_dss_unpack_sos_error(opal_buffer_t *buf, opal_sos_error_t *error) -{ - int count, rc; - if (NULL == error) { - return ORTE_ERROR; - } - - /* Unpack the errcode */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->errnum, &count, OPAL_INT))) { - return rc; - } - - /* Unpack the filename */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, error->file, &count, OPAL_STRING))) { - return rc; - } - - /* Unpack the line number */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->line, &count, OPAL_INT))) { - return rc; - } - - /* Unpack the function name */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, error->func, &count, OPAL_STRING))) { - return rc; - } - - /* Unpack the error message */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, error->msg, &count, OPAL_STRING))) { - return rc; - } - - /* Unpack the pointer to the previous error */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->prev, &count, OPAL_INT))) { - return rc; - } - - /* Unpack the pointer to the next error */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(buf, &error->next, &count, OPAL_INT))) { - return rc; - } - - return ORTE_SUCCESS; -} - -/* - * Function to unpack the entire SOS table on the HNP. 
- */ -static void process_sos_table_msg(int fd, short event, void *cbdata) -{ - orte_message_event_t *mev = (orte_message_event_t*)cbdata; - size_t table_size; - int i, rc = ORTE_SUCCESS, count, numerrors; - opal_sos_error_t *opal_error; - opal_hash_table_t *sos_table, *old_sos_table; - - /* Allocate a new SOS table */ - sos_table = OBJ_NEW(opal_hash_table_t); - if (NULL == sos_table) { - ORTE_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(mev); - return; - } - - /* Unpack the size of the SOS table */ - count = 1; - if (ORTE_SUCCESS != - (rc = opal_dss.unpack(mev->buffer, &table_size, &count, OPAL_SIZE))) { - goto error; - } - numerrors = (int) table_size; - - /* Initialize the SOS table */ - opal_hash_table_init(sos_table, table_size); - - for (i = 0; i < numerrors; i++) { - - opal_error = OBJ_NEW(opal_sos_error_t); - if (NULL == opal_error) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto error; - } - - if (ORTE_SUCCESS != - (rc = opal_dss_unpack_sos_error(mev->buffer, opal_error))) { - goto error; - } - - opal_hash_table_set_value_uint32(sos_table, - opal_error->errnum, - (void *)opal_error); - } - - /* Add this SOS table to the list of SOS tables. - If it already exists, we destroy the old table - and set the new one as the current SOS table. 
*/ - OPAL_THREAD_LOCK(&orte_notifier_hnp_tables_lock); - if (false == - opal_pointer_array_test_and_set_item(&orte_notifier_hnp_tables, - mev->sender.vpid, - (void *)sos_table)) { - old_sos_table = opal_pointer_array_get_item(&orte_notifier_hnp_tables, - mev->sender.vpid); - OBJ_DESTRUCT(old_sos_table); - old_sos_table = NULL; - opal_pointer_array_set_item(&orte_notifier_hnp_tables, - mev->sender.vpid, - (void *)sos_table); - } - OPAL_THREAD_UNLOCK(&orte_notifier_hnp_tables_lock); - OBJ_RELEASE(mev); - return; - -error: - ORTE_ERROR_LOG(rc); - /* release the message event */ - OBJ_RELEASE(mev); - - /* destroy the sos table */ - OBJ_DESTRUCT(sos_table); - return; -} -#endif - -void orte_notifier_hnp_recv_cb(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_notifier_base_output, - "%s notifier:hnp:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* Don't process the message right away - remember that we're in a - * callback during the actual RML receive! We need to get out of - * the receive before we process the message to avoid performing - * the rest of the job while still inside this receive. Instead, - * setup an event so that the message gets processed as soon as we - * leave the receive. This avoids the "receive recursion of - * death" scenarios. - * - * The ORTE_MESSAGE_EVENT macro makes a copy of the buffer, which - * we release in the process_msg() callback - the incoming buffer, - * however, is NOT released here, although its payload IS - * transferred to the message buffer for later processing. 
- */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg); - - /* reissue the receive, since it is non-persistent */ - if (ORTE_SUCCESS != - (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_NOTIFIER_HNP, - ORTE_RML_NON_PERSISTENT, - orte_notifier_hnp_recv_cb, - NULL))) { - ORTE_ERROR_LOG(rc); - } -} - - -#if OPAL_ENABLE_DEBUG -void orte_notifier_hnp_exception_cb(const orte_process_name_t* peer, - orte_rml_exception_t reason) -{ - opal_output(orte_notifier_base_output, - "Notifier HNP RML receive exception from %s", - ORTE_NAME_PRINT((orte_process_name_t*)peer)); -} -#endif diff --git a/opal/util/opal_sos_reporter.txt b/orte/mca/notifier/hnp/orte_notifier_hnp.txt similarity index 94% rename from opal/util/opal_sos_reporter.txt rename to orte/mca/notifier/hnp/orte_notifier_hnp.txt index 8b68da7602..4377e9d7fe 100644 --- a/opal/util/opal_sos_reporter.txt +++ b/orte/mca/notifier/hnp/orte_notifier_hnp.txt @@ -17,7 +17,7 @@ # # $HEADER$ # -# This is the US/English help file for OPAL SOS error messages. +# This is the US/English help file for HNP notifier messages. # # FORMAT: # filename:linenum:functionname diff --git a/orte/mca/notifier/notifier.h b/orte/mca/notifier/notifier.h index 8c08581592..feccbde9ee 100644 --- a/orte/mca/notifier/notifier.h +++ b/orte/mca/notifier/notifier.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All Rights Reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,9 +40,14 @@ #ifdef HAVE_STDARG_H #include #endif +#ifdef HAVE_LIMITS_H +#include +#endif +#ifdef HAVE_SYSLOG_H +#include +#endif #include "opal/mca/mca.h" -#include "opal/util/opal_sos.h" #include "orte/constants.h" #include "orte/types.h" @@ -56,16 +63,16 @@ BEGIN_C_DECLS */ #define ORTE_NOTIFIER_MAX_BUF 512 -/* Severities, based on OPAL SOS */ +/* Severities */ typedef enum { - ORTE_NOTIFIER_EMERG = OPAL_SOS_SEVERITY_EMERG, - ORTE_NOTIFIER_ALERT = OPAL_SOS_SEVERITY_ALERT, - ORTE_NOTIFIER_CRIT = OPAL_SOS_SEVERITY_CRIT, - ORTE_NOTIFIER_ERROR = OPAL_SOS_SEVERITY_ERROR, - ORTE_NOTIFIER_WARN = OPAL_SOS_SEVERITY_WARN, - ORTE_NOTIFIER_NOTICE = OPAL_SOS_SEVERITY_NOTICE, - ORTE_NOTIFIER_INFO = OPAL_SOS_SEVERITY_INFO, - ORTE_NOTIFIER_DEBUG = OPAL_SOS_SEVERITY_DEBUG + ORTE_NOTIFIER_EMERG = LOG_EMERG, + ORTE_NOTIFIER_ALERT = LOG_ALERT, + ORTE_NOTIFIER_CRIT = LOG_CRIT, + ORTE_NOTIFIER_ERROR = LOG_ERR, + ORTE_NOTIFIER_WARN = LOG_WARNING, + ORTE_NOTIFIER_NOTICE = LOG_NOTICE, + ORTE_NOTIFIER_INFO = LOG_INFO, + ORTE_NOTIFIER_DEBUG = LOG_DEBUG } orte_notifier_base_severity_t; /* diff --git a/orte/mca/odls/base/base.h b/orte/mca/odls/base/base.h index a3fa7a05ea..13d5000e6b 100644 --- a/orte/mca/odls/base/base.h +++ b/orte/mca/odls/base/base.h @@ -76,16 +76,9 @@ ORTE_DECLSPEC int orte_odls_base_select(void); ORTE_DECLSPEC int orte_odls_base_finalize(void); ORTE_DECLSPEC int orte_odls_base_close(void); -/* proc termination entry points */ -ORTE_DECLSPEC void orte_odls_base_notify_iof_complete(orte_process_name_t *proc); -ORTE_DECLSPEC void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status); - /* setup singleton job data */ ORTE_DECLSPEC void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid); -/* Lookup function to see if the child process has already finished. 
*/ -ORTE_DECLSPEC bool orte_odls_base_default_check_finished(orte_process_name_t *proc); - #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/mca/odls/base/odls_base_close.c b/orte/mca/odls/base/odls_base_close.c index 81bac5e442..3f12632050 100644 --- a/orte/mca/odls/base/odls_base_close.c +++ b/orte/mca/odls/base/odls_base_close.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -24,7 +26,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/class/opal_list.h" -#include "opal/threads/threads.h" +#include "opal/class/opal_pointer_array.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" @@ -33,11 +35,11 @@ int orte_odls_base_close(void) { + int i; + orte_proc_t *proc; opal_list_item_t *item; - + /* cleanup ODLS globals */ - OBJ_DESTRUCT(&orte_odls_globals.mutex); - OBJ_DESTRUCT(&orte_odls_globals.cond); while (NULL != (item = opal_list_remove_first(&orte_odls_globals.xterm_ranks))) { OBJ_RELEASE(item); } @@ -48,19 +50,13 @@ int orte_odls_base_close(void) } /* cleanup the global list of local children and job data */ - while (NULL != (item = opal_list_remove_first(&orte_local_children))) { - OBJ_RELEASE(item); + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + OBJ_RELEASE(proc); + } } - OBJ_DESTRUCT(&orte_local_children); - OBJ_DESTRUCT(&orte_local_children_lock); - OBJ_DESTRUCT(&orte_local_children_cond); - while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&orte_local_jobdata); - OBJ_DESTRUCT(&orte_local_jobdata_lock); - OBJ_DESTRUCT(&orte_local_jobdata_cond); - + OBJ_RELEASE(orte_local_children); + /* if no 
components are available, then punt */ if (!orte_odls_base.components_available) { return ORTE_SUCCESS; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 2bc7eccd48..848667f70c 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -11,9 +11,9 @@ * All rights reserved. * Copyright (c) 2007-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -58,10 +58,12 @@ #include "orte/mca/iof/iof.h" #include "orte/mca/iof/base/iof_base_setup.h" #include "orte/mca/ess/base/base.h" +#include "orte/mca/grpcomm/base/base.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/sensor/sensor.h" +#include "orte/mca/state/state.h" #include "orte/util/context_fns.h" #include "orte/util/name_fns.h" @@ -101,7 +103,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, int32_t numbytes; int8_t flag; int j; - orte_daemon_cmd_flag_t command; orte_app_context_t *app; if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) { @@ -153,12 +154,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, /* if anything was inserted, put it in a byte object for xmission */ if (0 < wireup->bytes_used) { opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes); - /* pack the number of bytes required by payload */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(wireup); - return rc; - } /* pack the byte object */ bo.size = 
numbytes; boptr = &bo; @@ -185,15 +180,6 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, opal_dss.pack(data, &flag, 1, OPAL_INT8); } - /* insert an "add-procs" command here so we can cleanly process it on the - * other end - */ - command = ORTE_DAEMON_ADD_LOCAL_PROCS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* are we co-locating debugger daemons? */ if (NULL != orte_debugger_daemon) { /* flag that we are */ @@ -315,6 +301,10 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, ORTE_ERROR_LOG(rc); return rc; } + /* save it on the job data object as we won't be unpacking the buffer + * on our end + */ + opal_dss.copy((void**)&jdata->pmap, &bo, OPAL_BYTE_OBJECT); /* release the data since it has now been copied into our buffer */ free(bo.bytes); @@ -332,84 +322,55 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, return ORTE_SUCCESS; } -int orte_odls_base_default_update_daemon_info(opal_buffer_t *data) +static int check_local_proc(orte_job_t *jdata, orte_proc_t *pptr) { - opal_buffer_t wireup; - opal_byte_object_t *bo; - int rc; - orte_std_cntr_t cnt; - int32_t numbytes; - int8_t flag; + orte_vpid_t host_daemon; + orte_app_context_t *app; - /* extract the byte object holding the daemonmap */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { - ORTE_ERROR_LOG(rc); - return rc; + /* get the vpid of the daemon that is to host this proc */ + OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, + "%s odls:constructing child list - looking for daemon for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); + if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; } - /* retain a copy for downloading to child processes */ - if (NULL != orte_odls_globals.dmap) { - 
free(orte_odls_globals.dmap->bytes); - free(orte_odls_globals.dmap); - orte_odls_globals.dmap = NULL; + + OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, + "%s odls:constructing child list - checking proc %s on daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), + ORTE_VPID_PRINT(host_daemon))); + + /* does this proc belong to us? */ + if (ORTE_PROC_MY_NAME->vpid != host_daemon) { + return ORTE_SUCCESS; } - opal_dss.copy((void**)&orte_odls_globals.dmap, bo, OPAL_BYTE_OBJECT); - - /* update our local nidmap, if required - the decode function - * knows what to do - it will also free the bytes in the bo - */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:update:daemon:info updating nidmap", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* update the routing tree */ - if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* see if we have wiring info as well */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 == flag) { - /* no - just return */ - return rc; + + OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, + "%s odls:constructing child list - found proc %s for me!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); + + /* is this child on our current list of children */ + if (!pptr->local_proc) { + /* not on the local list */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "adding proc %s to my local list", + ORTE_NAME_PRINT(&pptr->name))); + /* keep tabs of the number of local procs */ + jdata->num_local_procs++; + /* add this proc to our child list */ + OBJ_RETAIN(pptr); + pptr->local_proc = true; + opal_pointer_array_add(orte_local_children, pptr); } - /* unpack the #bytes of daemon wireup info in the message */ - cnt=1; - if 
(ORTE_SUCCESS != (rc = opal_dss.unpack(data, &numbytes, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - return rc; + /* if the job is in restart mode, the child must not barrier when launched */ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + pptr->do_not_barrier = true; } - /* any bytes there? */ - if (0 < numbytes) { - /* unpack the byte object */ - cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* load it into a buffer */ - OBJ_CONSTRUCT(&wireup, opal_buffer_t); - opal_dss.load(&wireup, bo->bytes, bo->size); - /* pass it for processing */ - if (ORTE_SUCCESS != (rc = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&wireup); - return rc; - } - /* done with the buffer - dump it */ - OBJ_DESTRUCT(&wireup); - } - + /* mark that this app_context is being used on this node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); + app->used_on_node = true; return ORTE_SUCCESS; } @@ -417,18 +378,15 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_jobid_t *job) { int rc; - orte_vpid_t j, host_daemon; - orte_odls_child_t *child; + orte_vpid_t j; orte_std_cntr_t cnt; - orte_odls_job_t *jobdat=NULL; + orte_job_t *jdata=NULL; opal_byte_object_t *bo; - opal_list_item_t *item; int8_t flag; orte_jobid_t debugger; - bool add_child; - orte_ns_cmp_bitmask_t mask; + int32_t n; orte_app_context_t *app; - orte_proc_t *pptr; + orte_proc_t *pptr, *p2; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:constructing child list", @@ -436,6 +394,41 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, *job = ORTE_JOBID_INVALID; + /* extract the byte object holding the daemon map */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* retain a copy for downloading to child 
processes */ + if (NULL != orte_odls_globals.dmap) { + free(orte_odls_globals.dmap->bytes); + free(orte_odls_globals.dmap); + orte_odls_globals.dmap = NULL; + } + orte_odls_globals.dmap = bo; + bo = NULL; + + /* unpack the wireup info flag */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* if it was given, unpack and discard it */ + if (0 != flag) { + /* unpack the byte object */ + cnt=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + if (0 < bo->size) { + free(bo->bytes); + } + free(bo); + } + /* unpack the flag - are we co-locating debugger daemons? */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { @@ -450,7 +443,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * worry about race conditions as the debugger daemons do not use * the daemon collective system */ - orte_odls_globals.debugger = OBJ_NEW(orte_odls_job_t); + orte_odls_globals.debugger = OBJ_NEW(orte_job_t); /* get the debugger daemon jobid */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &debugger, &cnt, ORTE_JOBID))) { @@ -460,7 +453,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_odls_globals.debugger->jobid = debugger; orte_odls_globals.debugger->num_apps = 1; orte_odls_globals.debugger->num_local_procs = 1; - opal_list_append(&orte_local_jobdata, &(orte_odls_globals.debugger)->super); + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(debugger), orte_odls_globals.debugger); /* retrieve the info */ cnt = 1; app = NULL; @@ -468,7 +461,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } - opal_pointer_array_add(&orte_odls_globals.debugger->apps, app); + opal_pointer_array_add(orte_odls_globals.debugger->apps, app); cnt=1; if (ORTE_SUCCESS != (rc = 
opal_dss.unpack(data, &(orte_odls_globals.debugger->controls), &cnt, ORTE_JOB_CONTROL))) { ORTE_ERROR_LOG(rc); @@ -483,7 +476,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * launching debugger daemons */ if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) { - goto done; + goto COMPLETE; } *job = ORTE_JOBID_INVALID; ORTE_ERROR_LOG(rc); @@ -500,106 +493,113 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * to our unpacking add_local_procs. So lookup the job record for this jobid * and see if it already exists */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - orte_odls_job_t *jdat = (orte_odls_job_t*)item; - - /* is this the specified job? */ - if (jdat->jobid == *job) { - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:construct_child_list found existing jobdat for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job))); - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { - /* setup jobdat object for this job */ + if (NULL == (jdata = orte_get_job_data_object(*job))) { + /* setup job object for this job */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:construct_child_list adding new jobdat for job %s", + "%s odls:construct_child_list adding new object for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job))); - jobdat = OBJ_NEW(orte_odls_job_t); - jobdat->jobid = *job; - opal_list_append(&orte_local_jobdata, &jobdat->super); + jdata = OBJ_NEW(orte_job_t); + jdata->jobid = *job; + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata); } + + /* if we are the HNP, we don't need to unpack this buffer - we already + * have all the required info in our local job array. 
So just build the + * array of local children + */ + if (ORTE_PROC_IS_HNP) { + for (n=0; n < jdata->procs->size; n++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { + continue; + } + if (ORTE_SUCCESS != (rc = check_local_proc(jdata, pptr))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + goto COMPLETE; + } + /* if we are doing a timing test, store the time the msg was recvd */ if (orte_timing) { - jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec; - jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec; + jdata->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec; + jdata->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec; } /* UNPACK JOB-SPECIFIC DATA */ /* unpack the job state so we can know if this is a restart vs initial launch */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->state, &cnt, ORTE_JOB_STATE))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->state, &cnt, ORTE_JOB_STATE))) { *job = ORTE_JOBID_INVALID; ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the number of nodes involved in this job */ + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_nodes, &cnt, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->map->num_nodes, &cnt, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the number of procs in this launch */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_procs, &cnt, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->num_procs, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the total slots allocated to us */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->total_slots_alloc, &cnt, ORTE_STD_CNTR))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->total_slots_alloc, &cnt, ORTE_STD_CNTR))) { 
ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } #if OPAL_HAVE_HWLOC /* unpack the binding policy */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->binding, &cnt, OPAL_BINDING_POLICY))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->map->binding, &cnt, OPAL_BINDING_POLICY))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } #endif /* unpack the control flags for the job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->controls, &cnt, ORTE_JOB_CONTROL))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the stdin target for the job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stdin_target, &cnt, ORTE_VPID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->stdin_target, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack whether or not process recovery is allowed for this job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->enable_recovery, &cnt, OPAL_BOOL))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->enable_recovery, &cnt, OPAL_BOOL))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } /* unpack the number of app_contexts for this job */ cnt=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_APP_IDX))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jdata->num_apps, &cnt, ORTE_APP_IDX))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:construct_child_list unpacking %ld app_contexts", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)jobdat->num_apps)); - for (j=0; j < jobdat->num_apps; j++) { + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)jdata->num_apps)); + for (j=0; j < jdata->num_apps; j++) { cnt = 1; app = NULL; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &app, &cnt, ORTE_APP_CONTEXT))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } - 
opal_pointer_array_set_item(&jobdat->apps, app->idx, app); + opal_pointer_array_set_item(jdata->apps, app->idx, app); } /* unpack the pidmap byte object */ @@ -609,11 +609,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, goto REPORT_ERROR; } /* retain a copy for downloading to child processes */ - if (NULL != jobdat->pmap && NULL != jobdat->pmap->bytes) { - free(jobdat->pmap->bytes); - free(jobdat->pmap); + if (NULL != jdata->pmap) { + if (NULL != jdata->pmap->bytes) { + free(jdata->pmap->bytes); + } + free(jdata->pmap); } - opal_dss.copy((void**)&jobdat->pmap, bo, OPAL_BYTE_OBJECT); + opal_dss.copy((void**)&jdata->pmap, bo, OPAL_BYTE_OBJECT); /* decode the pidmap - this will also free the bytes in bo */ if (ORTE_SUCCESS != (rc = orte_ess.update_pidmap(bo))) { ORTE_ERROR_LOG(rc); @@ -621,115 +623,32 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, } /* unpack the procs */ - for (j=0; j < jobdat->num_procs; j++) { + for (j=0; j < jdata->num_procs; j++) { cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &pptr, &cnt, ORTE_PROC))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } + /* add it to our global jdata object since + * many parts of the system will look for it there + */ + if (NULL != (p2 = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, pptr->name.vpid))) { + OBJ_RELEASE(p2); + } + opal_pointer_array_set_item(jdata->procs, pptr->name.vpid, pptr); - /* see if it is one of mine */ - ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&pptr->name)); - /* get the vpid of the daemon that is to host this proc */ - OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, - "%s odls:constructing child list - looking for daemon for proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); - if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; + /* see if it belongs to us */ + if (ORTE_SUCCESS != (rc = 
check_local_proc(jdata, pptr))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(pptr); goto REPORT_ERROR; } - - OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, - "%s odls:constructing child list - checking proc %s on daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), - ORTE_VPID_PRINT(host_daemon))); - - /* does this proc belong to us? */ - if (ORTE_PROC_MY_NAME->vpid == host_daemon) { - - OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, - "%s odls:constructing child list - found proc %s for me!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); - - add_child = true; - /* if this job is restarting procs, then we need to treat things - * a little differently. We may be adding a proc to our local - * children (if the proc moved here from somewhere else), or we - * may simply be restarting someone already here. - */ - if (ORTE_JOB_STATE_RESTART == jobdat->state) { - /* look for this job on our current list of children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == - orte_util_compare_name_fields(mask, child->name, &pptr->name)) { - /* do not duplicate this child on the list! */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "proc %s is on list and is %s", - ORTE_NAME_PRINT(&pptr->name), - (child->alive) ? 
"ALIVE" : "DEAD")); - add_child = false; - child->restarts = pptr->restarts; - child->do_not_barrier = true; - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx); - app->used_on_node = true; - break; - } - } - } - - /* if we need to add the child, do so */ - if (add_child) { - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "adding proc %s to my local list", - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jobdat->num_local_procs++; - /* add this proc to our child list */ - child = OBJ_NEW(orte_odls_child_t); - /* copy the name to preserve it */ - if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, &pptr->name, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } - child->app_idx = pptr->app_idx; /* save the index into the app_context objects */ - child->restarts = pptr->restarts; - /* if the job is in restart mode, the child must not barrier when launched */ - if (ORTE_JOB_STATE_RESTART == jobdat->state) { - child->do_not_barrier = true; - } -#if OPAL_HAVE_HWLOC - if (NULL != pptr->cpu_bitmap) { - child->cpu_bitmap = strdup(pptr->cpu_bitmap); - } -#endif - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, pptr->app_idx); - app->used_on_node = true; - /* protect operation on the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - opal_list_append(&orte_local_children, &child->super); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - } - } - OBJ_RELEASE(pptr); } - /* flag that the launch msg has been processed so daemon collectives can proceed */ - OPAL_THREAD_LOCK(&jobdat->lock); - jobdat->launch_msg_processed = true; - opal_condition_broadcast(&jobdat->cond); - OPAL_THREAD_UNLOCK(&jobdat->lock); + COMPLETE: + /* progress any pending collectives */ + 
orte_grpcomm_base_progress_collectives(); - done: return ORTE_SUCCESS; REPORT_ERROR: @@ -739,8 +658,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, * for it to happen - especially so developers don't have to * deal with the hang! */ - orte_errmgr.update_state(*job, ORTE_JOB_STATE_NEVER_LAUNCHED, - NULL, ORTE_PROC_STATE_UNDEF, 0, rc); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_NEVER_LAUNCHED); return rc; } @@ -912,7 +830,7 @@ static int odls_base_default_setup_fork(orte_app_context_t *context, return ORTE_SUCCESS; } -static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env) +static int setup_child(orte_proc_t *child, orte_job_t *jobdat, char ***env) { char *param, *value; orte_node_rank_t node_rank; @@ -920,7 +838,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * int rc; /* setup the jobid */ - if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name->jobid))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) { ORTE_ERROR_LOG(rc); return rc; } @@ -933,24 +851,8 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * free(param); free(value); -#if ORTE_ENABLE_EPOCH - /* setup the epoch */ - if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, child->name->epoch))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (NULL == (param = mca_base_param_environ_variable("orte","ess","epoch"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - return rc; - } - opal_setenv(param, value, true, env); - free(param); - free(value); -#endif - /* setup the vpid */ - if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) { + if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) { ORTE_ERROR_LOG(rc); return rc; } @@ -980,7 +882,7 @@ static int setup_child(orte_odls_child_t *child, 
orte_odls_job_t *jobdat, char * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ - if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) { + if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(&child->name))) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; return rc; @@ -996,7 +898,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ - if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) { + if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(&child->name))) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; return rc; @@ -1123,34 +1025,38 @@ static int setup_path(orte_app_context_t *app) /* define a timer release point so that we can wait for * file descriptors to come available, if necessary */ -static bool time_is_up; - static void timer_cb(int fd, short event, void *cbdata) { - opal_event_t *ev = (opal_event_t*)cbdata; - - /* free event */ - if (NULL != ev) { - free(ev); - } - /* declare time is up */ - time_is_up = true; + orte_timer_t *tm = (orte_timer_t*)cbdata; + orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload; + + /* increment the number of retries */ + ll->retries++; + + /* re-attempt the launch */ + opal_event_active(ll->ev, OPAL_EV_WRITE, 1); + + /* release the timer event */ + OBJ_RELEASE(tm); } static int compute_num_procs_alive(orte_jobid_t job) { - opal_list_item_t *item; - orte_odls_child_t *child; + int i; + orte_proc_t *child; int num_procs_alive = 0; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end (&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == 
(child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (!child->alive) { continue; } - if (job == child->name->jobid) { + /* do not include members of the specified job as they + * will be added later, if required + */ + if (job == child->name.jobid) { continue; } num_procs_alive++; @@ -1158,28 +1064,25 @@ static int compute_num_procs_alive(orte_jobid_t job) return num_procs_alive; } -int orte_odls_base_default_launch_local(orte_jobid_t job, - orte_odls_base_fork_local_proc_fn_t fork_local) + +void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata) { - opal_list_item_t *item; orte_app_context_t *app, *dbg; - orte_odls_child_t *child=NULL; + orte_proc_t *child=NULL; bool oversubscribed; int rc=ORTE_SUCCESS; - bool launch_failed=true; - opal_buffer_t alert; + opal_buffer_t *alert; orte_std_cntr_t proc_rank; - orte_odls_job_t *jobdat; char basedir[MAXPATHLEN]; char **argvsav=NULL; - int inm, j; - opal_event_t *delay; + int inm, j, idx; int total_num_local_procs = 0; orte_nid_t *nid; orte_node_t *node; - - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata; + orte_job_t *jobdat; + orte_jobid_t job = caddy->job; + orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local; /* establish our baseline working directory - we will be potentially * bouncing around as we execute various apps, but we will always return @@ -1188,29 +1091,19 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, getcwd(basedir, sizeof(basedir)); /* find the jobdat for this job */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - - /* is this the specified job? 
*/ - if (jobdat->jobid == job) { - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto GETOUT; + /* not much we can do here - we are just hosed, so + * report that to the error manager + */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; } /* do we have any local procs to launch? */ if (0 == jobdat->num_local_procs) { - /* no - just return */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return ORTE_SUCCESS; + /* indicate that we are done trying to launch them */ + goto GETOUT; } /* see if the mapper thinks we are oversubscribed */ @@ -1219,8 +1112,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* just fake it - we don't keep a local nidmap */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto CLEANUP; + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; } if (node->oversubscribed) { oversubscribed = true; @@ -1249,8 +1142,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* Now we preload any files that are needed. 
This is done on a per * app context basis */ - for (j=0; j < jobdat->apps.size; j++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { + for (j=0; j < jobdat->apps->size; j++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) { continue; } if(app->used_on_node && @@ -1263,8 +1156,8 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } #if OPAL_ENABLE_FT_CR == 1 - for (j=0; j < jobdat->apps.size; j++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { + for (j=0; j < jobdat->apps->size; j++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) { continue; } orte_sstore.fetch_app_deps(app); @@ -1273,13 +1166,13 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, #endif /* setup to report the proc state to the HNP */ - OBJ_CONSTRUCT(&alert, opal_buffer_t); + alert = OBJ_NEW(opal_buffer_t); /* compute the total number of local procs currently alive and about to be launched */ total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; - for (j=0; j < jobdat->apps.size; j++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { + for (j=0; j < jobdat->apps->size; j++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) { continue; } @@ -1299,24 +1192,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_sys_limits.num_procs, total_num_local_procs)); if (opal_sys_limits.num_procs < total_num_local_procs) { - /* don't have enough - wait a little time */ - time_is_up = false; - ORTE_DETECT_TIMEOUT(&delay, 1000, 1000, -1, timer_cb); - /* wait */ - ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); - /* recompute the num local procs */ - total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; - /* see if we still have a problem */ - 
OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, - "%s rechecking limit on num procs %d #children needed %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - opal_sys_limits.num_procs, total_num_local_procs)); - if (opal_sys_limits.num_procs < total_num_local_procs) { - /* at the system limit - abort */ - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - rc = ORTE_ERR_SYS_LIMITS_CHILDREN; - goto CLEANUP; + if (2 < caddy->retries) { + /* if we have already tried too many times, then just give up */ + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH); + goto ERROR_OUT; } + /* set a timer event so we can retry later - this + * gives the system a chance to let other procs + * terminate, thus creating room for new ones + */ + ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy); + return; } } @@ -1326,7 +1212,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, jobdat->num_local_procs, jobdat->num_procs, jobdat->total_slots_alloc, - jobdat->num_nodes, + jobdat->map->num_nodes, oversubscribed, &app->env))) { @@ -1343,17 +1229,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * so we can report things out correctly */ /* cycle through children to find those for this jobid */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) && + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; + } + if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) && j == (int)child->app_idx) { child->exit_code = rc; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); } } - /* okay, now tell the HNP we couldn't do it */ - goto CLEANUP; + goto GETOUT; } /* setup the working directory for this app - will jump us 
@@ -1372,24 +1258,24 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * so we can report things out correctly */ /* cycle through children to find those for this jobid */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) && + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; + } + if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) && j == (int)child->app_idx) { child->exit_code = rc; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); } } - /* okay, now tell the HNP we couldn't do it */ - goto CLEANUP; + goto GETOUT; } /* okay, now let's launch all the local procs for this app using the provided fork_local fn */ - for (proc_rank = 0, item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (proc_rank = 0, idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; + } /* does this child belong to this app? 
*/ if (j != (int)child->app_idx) { @@ -1403,9 +1289,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, if (child->alive) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch child %s is already alive", + "%s odls:launch child %s has already been launched", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); continue; } @@ -1414,12 +1300,12 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, * job could be given as a WILDCARD value, we must use * the dss.compare function to check for equality. */ - if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { + if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch child %s is not in job %s being launched", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), ORTE_JOBID_PRINT(job))); continue; @@ -1428,12 +1314,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch working child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* ensure we clear any prior info regarding state or exit status in * case this is a restart */ - child->state = ORTE_PROC_STATE_FAILED_TO_START; child->exit_code = 0; child->waitpid_recvd = false; /* if we are not forwarding output for this job, then @@ -1444,7 +1329,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } else { child->iof_complete = true; } - child->coll_recvd = false; child->pid = 0; if (NULL != child->rml_uri) { free(child->rml_uri); @@ -1464,25 +1348,14 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_sys_limits.num_files, limit)); if (opal_sys_limits.num_files < limit) { - /* don't have enough - wait a little time */ - time_is_up = 
false; - ORTE_DETECT_TIMEOUT(&delay, 1000, 1000, -1, timer_cb); - /* wait */ - ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); - /* recompute the num procs alive */ - total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs; - /* see if we still have a problem */ - limit = 4*total_num_local_procs + 6; - OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, - "%s rechecking limit on file descriptors %d need %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - opal_sys_limits.num_files, limit)); - if (opal_sys_limits.num_files < limit) { - /* nope - abort */ - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); - child->exit_code = rc; - goto CLEANUP; + if (2 < caddy->retries) { + /* tried enough - give up */ + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } + /* don't have enough - wait a little time */ + ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy); + return; } } @@ -1496,7 +1369,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, nmitem = opal_list_get_next(nmitem)) { nm = (orte_namelist_t*)nmitem; if (ORTE_VPID_WILDCARD == nm->name.vpid || - child->name->vpid == nm->name.vpid) { + child->name.vpid == nm->name.vpid) { /* we want this one - modify the app's command to include * the orte xterm cmd. Need to be careful, though, that we * don't modify the app for ALL ranks that use it! 
So we @@ -1512,7 +1385,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } /* insert the rank into the correct place as a window title */ free(app->argv[2]); - asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name->vpid)); + asprintf(&app->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid)); /* add back the original argv */ for (inm=0; inm < opal_argv_count(argvsav); inm++) { opal_argv_append_nosize(&app->argv, argvsav[inm]); @@ -1528,9 +1401,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_show_help("help-orte-odls-base.txt", "orte-odls-base:xterm-rank-out-of-bounds", true, nm->name.vpid, jobdat->num_procs); - rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; child->exit_code = ORTE_ERR_SILENT; - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } } @@ -1555,9 +1428,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_show_help("help-orte-odls-base.txt", "orte-odls-base:fork-agent-not-found", true, orte_process_info.nodename, orte_fork_agent[0]); - rc = ORTE_ERR_SILENT; child->exit_code = ORTE_ERR_SILENT; - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } } @@ -1566,20 +1439,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, */ if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } - /* if we are timing things, record when we are going to launch this proc */ - if (orte_timing) { - gettimeofday(&child->starttime, NULL); - } - - /* must unlock prior to fork to keep things clean in the - * event library - */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - #if OPAL_ENABLE_FT_CR == 1 /* * OPAL CRS components need the opportunity to take action before a process @@ -1590,21 +1453,22 @@ int 
orte_odls_base_default_launch_local(orte_jobid_t job, * - Binary to exec */ if( NULL != opal_crs.crs_prelaunch ) { - if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name->vpid, + if( OPAL_SUCCESS != (rc = opal_crs.crs_prelaunch(child->name.vpid, orte_sstore_base_prelaunch_location, &(app->app), &(app->cwd), &(app->argv), &(app->env) ) ) ) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); + continue; } } #endif if (5 < opal_output_get_verbosity(orte_odls_globals.output)) { opal_output(orte_odls_globals.output, "%s odls:launch: spawning child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name)); + ORTE_NAME_PRINT(&child->name)); /* dump what is going to be exec'd */ if (7 < opal_output_get_verbosity(orte_odls_globals.output)) { @@ -1613,24 +1477,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } rc = fork_local(app, child, app->env, jobdat); - /* reaquire lock so we don't double unlock... */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); if (ORTE_SUCCESS != rc) { /* do NOT ERROR_LOG this error - it generates * a message/node as most errors will be common * across the entire cluster. 
Instead, we let orterun * output a consolidated error message for us */ - goto CLEANUP; + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START); + continue; } else { child->alive = true; - child->state = ORTE_PROC_STATE_LAUNCHED; - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(child->name->jobid, ORTE_JOB_STATE_LAUNCHED, - child->name, child->state, - child->pid, child->exit_code))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING); } /* move to next processor */ proc_rank++; @@ -1655,39 +1512,25 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, */ chdir(basedir); } - launch_failed = false; - CLEANUP: - /* ensure we reset our working directory back to our default location */ - chdir(basedir); - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch reporting job %s launch status", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - /* if the launch failed, we need to flag all the procs from this job - * that didn't launch as having failed, or else we will hang + /* check to see if we need to + * co-locate any debugger daemons so that they get launched + * before we report anything to the HNP. This ensures that + * the debugger daemons are ready-to-go before mpirun returns + * from the plm.spawn command. 
Only spawn the debugger, though, + * if we have local children - otherwise, the HNP could spawn + * a debugger when it doesn't have any local procs */ - if (launch_failed) { - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(jobdat->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, 0, - child->exit_code))) { - ORTE_ERROR_LOG(rc); + if (NULL != orte_odls_globals.debugger && + !orte_odls_globals.debugger_launched) { + child = NULL; + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + break; + } } - } else { - /* if the launch succeeded, check to see if we need to - * co-locate any debugger daemons so that they get launched - * before we report anything to the HNP. This ensures that - * the debugger daemons are ready-to-go before mpirun returns - * from the plm.spawn command. Only spawn the debugger, though, - * if we have local children - otherwise, the HNP could spawn - * a debugger when it doesn't have any local procs - */ - if (NULL != orte_odls_globals.debugger && - !orte_odls_globals.debugger_launched && - 0 < opal_list_get_size(&orte_local_children)) { - dbg = (orte_app_context_t*)opal_pointer_array_get_item(&orte_odls_globals.debugger->apps, 0); + if (NULL != child) { + dbg = (orte_app_context_t*)opal_pointer_array_get_item(orte_odls_globals.debugger->apps, 0); OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch forking debugger %s with %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), dbg->app, @@ -1698,77 +1541,71 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, orte_process_info.num_procs, false, &dbg->env); fork_local(dbg, NULL, dbg->env, orte_odls_globals.debugger); orte_odls_globals.debugger_launched = true; - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(orte_odls_globals.debugger->jobid, - ORTE_JOB_STATE_RUNNING, - NULL, ORTE_PROC_STATE_UNDEF, 0, - ORTE_ERROR_DEFAULT_EXIT_CODE))) { - ORTE_ERROR_LOG(rc); - } + 
orte_odls_globals.debugger->state = ORTE_JOB_STATE_RUNNING; } + } + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:launch setting waitpids", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(jobdat->jobid, ORTE_JOB_STATE_RUNNING, - NULL, ORTE_PROC_STATE_UNDEF, 0, - ORTE_ERROR_DEFAULT_EXIT_CODE))) { - ORTE_ERROR_LOG(rc); + /* start the sensors for this job (if any) */ + orte_sensor.start(jobdat->jobid); + + /* setup the waitpids on the children that started */ + for (idx=0; idx < orte_local_children->size; idx++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) { + continue; } - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:launch setting waitpids", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* start the sensors for this job (if any) */ - orte_sensor.start(jobdat->jobid); - - /* if the launch didn't fail, setup the waitpids on the children */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - if (child->name->jobid == jobdat->jobid) { - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - } + if (child->name.jobid == jobdat->jobid && child->alive) { + orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); } } GETOUT: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return rc; + /* tell the state machine that all local procs for this job + * were launched so that it can do whatever it needs to do, + * like send a state update message for all procs to the HNP + */ + ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE); + + ERROR_OUT: + /* ensure we reset our working directory back to our default location */ + chdir(basedir); + /* 
release the event */ + OBJ_RELEASE(caddy); } int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag) { int rc, exit_status = ORTE_SUCCESS; - opal_list_item_t *item; - orte_odls_child_t *child; - - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + int i; + orte_proc_t *child; + opal_buffer_t *relay; + + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* do we have a child from the specified job. Because the * job could be given as a WILDCARD value, we must use * the dss.compare function to check for equality. */ if (!child->alive || - OPAL_EQUAL != opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) { + OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) { continue; } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: sending message to tag %lu on child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (unsigned long)tag, ORTE_NAME_PRINT(child->name))); + (unsigned long)tag, ORTE_NAME_PRINT(&child->name))); /* if so, send the message */ - rc = orte_rml.send_buffer(child->name, buffer, tag, 0); + relay = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(relay, buffer); + rc = orte_rml.send_buffer_nb(&child->name, relay, tag, 0, orte_rml_send_callback, NULL); if (rc < 0 && rc != ORTE_ERR_ADDRESSEE_UNKNOWN) { /* ignore if the addressee is unknown as a race condition could * have allowed the child to exit before we send it a barrier @@ -1780,14 +1617,12 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff */ ORTE_ERROR_LOG(rc); exit_status = rc; + OBJ_RELEASE(relay); goto cleanup; } } cleanup: - 
opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return exit_status; } @@ -1799,45 +1634,36 @@ int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buff int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal, orte_odls_base_signal_local_fn_t signal_local) { - int rc; - opal_list_item_t *item; - orte_odls_child_t *child; + int rc, i; + orte_proc_t *child; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: signaling proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - /* if procs is NULL, then we want to signal all * of the local procs, so just do that case */ if (NULL == proc) { rc = ORTE_SUCCESS; /* pre-set this as an empty list causes us to drop to bottom */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) { ORTE_ERROR_LOG(rc); } } - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); return rc; } /* we want it sent to some specified process, so find it */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (OPAL_EQUAL == opal_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) { - /* unlock before 
signaling as this may generate a callback */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) { ORTE_ERROR_LOG(rc); } @@ -1849,14 +1675,12 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i * report that as an error and return it */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); return ORTE_ERR_NOT_FOUND; } void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) { - orte_odls_job_t *jobdat; + orte_job_t *jobdat; orte_vpid_t vpid1; int32_t one32; orte_local_rank_t lrank; @@ -1870,11 +1694,11 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) #endif /* create a job tracking object for it */ - jobdat = OBJ_NEW(orte_odls_job_t); + jobdat = OBJ_NEW(orte_job_t); jobdat->jobid = jobid; jobdat->num_procs = 1; jobdat->num_local_procs = 1; - opal_list_append(&orte_local_jobdata, &jobdat->super); + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jobid), jobdat); /* need to setup a pidmap for it */ OBJ_CONSTRUCT(&buffer, opal_buffer_t); opal_dss.pack(&buffer, &jobid, 1, ORTE_JOBID); /* jobid */ @@ -1925,90 +1749,64 @@ void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) } free(bo); } - /* flag that the "launch msg" has been processed so that daemon - * collectives can proceed - */ - jobdat->launch_msg_processed = true; } int orte_odls_base_default_require_sync(orte_process_name_t *proc, opal_buffer_t *buf, bool drop_nidmap) { - opal_buffer_t buffer; - opal_list_item_t *item; - orte_odls_child_t *child; + opal_buffer_t *buffer; + orte_proc_t *child; orte_std_cntr_t cnt; - int rc=ORTE_SUCCESS; + int rc=ORTE_SUCCESS, i; bool found=false, registering=false; - orte_odls_job_t *jobdat, *jdat; + orte_job_t *jobdat; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync on child %s", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } /* find this child */ - if (OPAL_EQUAL == opal_dss.compare(proc, child->name, ORTE_NAME)) { + if (OPAL_EQUAL == opal_dss.compare(proc, &child->name, ORTE_NAME)) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: registering sync on child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); found = true; break; } } - /* if it wasn't found on the list, then we need to add it - must have - * come from a singleton - */ + /* if it wasn't found, that's an error */ if (!found) { - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls: registering sync on singleton %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - child = OBJ_NEW(orte_odls_child_t); - if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, proc, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - opal_list_append(&orte_local_children, &child->super); - /* we don't know any other info about the child, so just indicate it's - * alive - */ - child->alive = true; - /* setup jobdat object for its job so daemon collectives work */ - orte_odls_base_setup_singleton_jobdat(proc->jobid); + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; } - - /* if the contact info is already set, then we are "de-registering" the child - * so free the info and set it to NULL - */ - if (child->init_recvd && NULL != child->rml_uri) { - child->fini_recvd = true; + + /* if the child has registered, then we 
are "de-registering" the child */ + if (child->registered) { + child->deregistered = true; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync deregistering child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); } else { - /* if the contact info is not set, then we are registering the child so + /* otherwise, we are registering the child so * unpack the contact info from the buffer and store it */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: require sync registering child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - child->init_recvd = true; + ORTE_NAME_PRINT(&child->name))); + child->registered = true; registering = true; cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(child->rml_uri), &cnt, OPAL_STRING))) { @@ -2017,29 +1815,20 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, } /* ack the call */ - OBJ_CONSTRUCT(&buffer, opal_buffer_t); + buffer = OBJ_NEW(opal_buffer_t); /* do they want the nidmap? */ if (drop_nidmap) { /* get the jobdata object */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == child->name->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto CLEANUP; } - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:sync nidmap requested for job %s", + "%s odls:sync nidmap requested for job %s: dmap %s pmap %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobdat->jobid))); + ORTE_JOBID_PRINT(jobdat->jobid), + (NULL == orte_odls_globals.dmap) ? "NULL" : "READY", + (NULL == jobdat->pmap) ? 
"NULL" : "READY")); /* the proc needs a copy of both the daemon/node map, and * the process map for its peers */ @@ -2053,25 +1842,25 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, /* send the local topology so the individual apps * don't hammer the system to collect it themselves */ - opal_dss.pack(&buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO); + opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO); #endif - opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT); - opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT); + opal_dss.pack(buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT); + opal_dss.pack(buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT); } } OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: sending sync ack to child %s with %ld bytes of data", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), (long)buffer.bytes_used)); + ORTE_NAME_PRINT(proc), (long)buffer->bytes_used)); - if (0 > (rc = orte_rml.send_buffer(proc, &buffer, ORTE_RML_TAG_SYNC, 0))) { + if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer, ORTE_RML_TAG_SYNC, + 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buffer); + OBJ_RELEASE(buffer); goto CLEANUP; } rc = ORTE_SUCCESS; - OBJ_DESTRUCT(&buffer); OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls: Finished sending sync ack to child %s (Registering %s)", @@ -2080,7 +1869,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, /* if we are deregistering, then we are done */ if (!registering) { - orte_routed.delete_route(child->name); + orte_routed.delete_route(&child->name); if( NULL != child->rml_uri ) { free(child->rml_uri); child->rml_uri = NULL; @@ -2089,208 +1878,94 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc, } /* update the proc state */ - orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - proc, ORTE_PROC_STATE_REGISTERED, 0, 0); + 
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_REGISTERED); CLEANUP: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); return rc; } -/* receive external-to-odls notification that a proc has met some completion - * requirements - */ -void orte_odls_base_notify_iof_complete(orte_process_name_t *proc) -{ - orte_odls_child_t *child; - opal_list_item_t *item; - int rc; - orte_ns_cmp_bitmask_t mask; - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:notify_iof_complete for child %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { /* found it */ - goto GOTCHILD; - } - } - /* get here if we didn't find the child, or if the specified child - * is already dead. 
If the latter, then we have a problem as it - * means we are detecting it exiting multiple times - */ - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:proc_complete did not find child %s in table!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* it's just a race condition - don't error log it */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; - -GOTCHILD: - /* flag the iof as complete */ - child->iof_complete = true; - /* now check to see if the proc is truly done */ - if (child->waitpid_recvd) { - /* CHILD IS COMPLETE */ - child->alive = false; - - /* Release only the stdin IOF file descriptor for this child, if one - * was defined. File descriptors for the other IOF channels - stdout, - * stderr, and stddiag - were released when their associated pipes - * were cleared and closed due to termination of the process - */ - if (NULL != orte_iof.close) { - orte_iof.close(proc, ORTE_IOF_STDIN); - } - /* Clean up the session directory as if we were the process - * itself. This covers the case where the process died abnormally - * and didn't cleanup its own session directory. - */ - orte_session_dir_finalize(proc); - /* alert the errmgr */ - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - proc, child->state, child->pid, - child->exit_code))) { - ORTE_ERROR_LOG(rc); - } - } - - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); -} - void orte_odls_base_default_report_abort(orte_process_name_t *proc) { - orte_odls_child_t *child; - opal_list_item_t *item; - opal_buffer_t buffer; - int rc; + orte_proc_t *child; + opal_buffer_t *buffer; + int rc, i; orte_ns_cmp_bitmask_t mask; - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. 
This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == - orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */ + orte_util_compare_name_fields(mask, proc, &child->name)) { /* found it */ child->state = ORTE_PROC_STATE_CALLED_ABORT; /* send ack */ - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - if (0 > (rc = orte_rml.send_buffer(proc, &buffer, ORTE_RML_TAG_ABORT, 0))) { + buffer = OBJ_NEW(opal_buffer_t); + if (0 > (rc = orte_rml.send_buffer_nb(proc, buffer, + ORTE_RML_TAG_ABORT, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); } - OBJ_DESTRUCT(&buffer); break; } } - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); } -void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) +/* + * Wait for a callback indicating the child has completed. 
+ */ + +void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata) { - orte_odls_child_t *child, *chd; - orte_odls_job_t *jobdat, *jdat; - opal_list_item_t *item; - int rc; - orte_ns_cmp_bitmask_t mask; + orte_proc_t *proc=NULL, *cptr; + int i; + orte_job_t *jobdat; + orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired on child %s with status %d", + "%s odls:wait_local_proc child process %ld terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), WEXITSTATUS(status))); - - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + (long)pid)); /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == - orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */ - goto GOTCHILD; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (pid == cptr->pid) { + proc = cptr; + break; } } - /* get here if we didn't find the child, or if the specified child - * is already dead. 
If the latter, then we have a problem as it - * means we are detecting it exiting multiple times - */ - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired did not find child %s in table!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - /* it's just a race condition - don't error log it */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; - - GOTCHILD: + if (NULL == proc) { + /* get here if we didn't find the child, or if the specified child + * is already dead. If the latter, then we have a problem as it + * means we are detecting it exiting multiple times + */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:wait_local_proc did not find pid %ld in table!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (long)pid)); + return; + } + /* if the child was previously flagged as dead, then just * ensure that its exit state gets reported to avoid hanging */ - if (!child->alive) { + if (!proc->alive) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child %s was already dead", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); goto MOVEON; } /* get the jobdat for this child */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jdat = (orte_odls_job_t*)item; - if (jdat->jobid == child->name->jobid) { - jobdat = jdat; - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto MOVEON; } @@ -2299,27 +1974,26 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) * and return as we aren't monitoring it */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jobdat->controls) { - child->state = ORTE_PROC_STATE_TERMINATED; goto MOVEON; } /* if this child was ordered to die, 
then just pass that along * so we don't hang */ - if (ORTE_PROC_STATE_KILLED_BY_CMD == child->state) { + if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child %s was ordered to die", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); goto MOVEON; } /* determine the state of this process */ if(WIFEXITED(status)) { /* set the exit status appropriately */ - child->exit_code = WEXITSTATUS(status); + proc->exit_code = WEXITSTATUS(status); - if (ORTE_PROC_STATE_CALLED_ABORT == child->state) { + if (ORTE_PROC_STATE_CALLED_ABORT == proc->state) { /* even though the process exited "normally", it happened * via an orte_abort call, so we need to indicate this was * an "abnormal" termination. @@ -2327,81 +2001,69 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child %s died by call to abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - child->state = ORTE_PROC_STATE_ABORTED; + ORTE_NAME_PRINT(&proc->name))); + state = ORTE_PROC_STATE_CALLED_ABORT; goto MOVEON; } /* check to see if a sync was required and if it was received */ - if (child->init_recvd) { - if (!child->fini_recvd) { - /* we required a finalizing sync and didn't get it, so this - * is considered an abnormal termination and treated accordingly + if (proc->registered) { + if (proc->deregistered) { + /* if we did recv a finalize sync, then declare it normally terminated + * unless it returned with a non-zero status indicating the code + * felt it was non-normal */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + if (0 != proc->exit_code) { + proc->state = ORTE_PROC_STATE_TERM_NON_ZERO; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " "but with a non-zero exit 
status - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); } else { - child->state = ORTE_PROC_STATE_TERM_WO_SYNC; - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:waitpid_fired child process %s terminated normally " - "but did not provide a required finalize sync - it " - "will be treated as an abnormal termination", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + /* indicate the waitpid fired */ + state = ORTE_PROC_STATE_WAITPID_FIRED; } - - goto MOVEON; - } - /* if we did recv a finalize sync, then declare it normally terminated - * unless it returned with a non-zero status indicating the code - * felt it was non-normal - */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + } else { + /* we required a finalizing sync and didn't get it, so this + * is considered an abnormal termination and treated accordingly + */ + state = ORTE_PROC_STATE_TERM_WO_SYNC; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " - "but with a non-zero exit status - it " + "but did not provide a required finalize sync - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - } else { - child->state = ORTE_PROC_STATE_TERMINATED; + ORTE_NAME_PRINT(&proc->name))); } } else { /* has any child in this job already registered? 
*/ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - chd = (orte_odls_child_t*)item; - - if (chd->init_recvd) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (cptr->registered) { /* someone has registered, and we didn't before * terminating - this is an abnormal termination */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + if (0 != proc->exit_code) { + state = ORTE_PROC_STATE_TERM_NON_ZERO; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " "but with a non-zero exit status - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); } else { - child->state = ORTE_PROC_STATE_TERM_WO_SYNC; + state = ORTE_PROC_STATE_TERM_WO_SYNC; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated normally " "but did not provide a required init sync - it " "will be treated as an abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&proc->name))); } - goto MOVEON; } } @@ -2409,23 +2071,23 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) * none of them will. This is considered acceptable. 
Still * flag it as abnormal if the exit code was non-zero */ - if (0 != child->exit_code) { - child->state = ORTE_PROC_STATE_TERM_NON_ZERO; + if (0 != proc->exit_code) { + state = ORTE_PROC_STATE_TERM_NON_ZERO; } else { - child->state = ORTE_PROC_STATE_TERMINATED; + state = ORTE_PROC_STATE_WAITPID_FIRED; } } - + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), - (0 == child->exit_code) ? "normally" : "with non-zero status")); + ORTE_NAME_PRINT(&proc->name), + (0 == proc->exit_code) ? "normally" : "with non-zero status")); } else { /* the process was terminated with a signal! That's definitely * abnormal, so indicate that condition */ - child->state = ORTE_PROC_STATE_ABORTED_BY_SIG; + state = ORTE_PROC_STATE_ABORTED_BY_SIG; /* If a process was killed by a signal, then make the * exit code of orterun be "signo + 128" so that "prog" * and "orterun prog" will both yield the same exit code. @@ -2435,149 +2097,32 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status) * the termination code to exit status translation the * same way */ - child->exit_code = WTERMSIG(status) + 128; + proc->exit_code = WTERMSIG(status) + 128; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:waitpid_fired child process %s terminated with signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name) )); + ORTE_NAME_PRINT(&proc->name) )); /* Do not decrement the number of local procs here. That is handled in the errmgr */ } MOVEON: - /* indicate the waitpid fired */ - child->waitpid_recvd = true; - - /* now check to see if the proc is truly done */ - if (child->iof_complete) { - /* CHILD IS COMPLETE */ - child->alive = false; - - /* Release only the stdin IOF file descriptor for this child, if one - * was defined. 
File descriptors for the other IOF channels - stdout, - * stderr, and stddiag - were released when their associated pipes - * were cleared and closed due to termination of the process - */ - if (NULL != orte_iof.close) { - orte_iof.close(proc, ORTE_IOF_STDIN); - } - - /* Clean up the session directory as if we were the process - * itself. This covers the case where the process died abnormally - * and didn't cleanup its own session directory. - */ - orte_session_dir_finalize(proc); - /* alert the errmgr */ - if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - proc, child->state, child->pid, - child->exit_code))) { - ORTE_ERROR_LOG(rc); - } - } - - /* done */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); -} - -/* - * Wait for a callback indicating the child has completed. - */ - -void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata) -{ - orte_odls_child_t *child; - opal_list_item_t *item, *next; - int rc; - opal_buffer_t cmdbuf; - orte_daemon_cmd_flag_t command; - int32_t istatus; - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:wait_local_proc child process %ld terminated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)pid)); - - /* since we are going to be working with the global list of - * children, we need to protect that list from modification - * by other threads. This will also be used to protect us - * from race conditions on any abort situation - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - - if (pid == child->pid) { /* found it */ - /* this is an independent entry point from the event library. 
To avoid - * race conditions, we need to get back into the progression of messages - * and commands to be processed by the daemon. We do this by re-posting - * the event into the daemon cmd processor - */ - OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t); - command = ORTE_DAEMON_WAITPID_FIRED; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, child->name, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - istatus = status; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &istatus, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - /* done */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; - } - } - /* get here if we didn't find the child, or if the specified child - * is already dead. If the latter, then we have a problem as it - * means we are detecting it exiting multiple times - */ - - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, - "%s odls:wait_local_proc did not find pid %ld in table!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (long)pid)); - - /* it's just a race condition - don't error log it */ -CLEANUP: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return; + ORTE_ACTIVATE_PROC_STATE(&proc->name, state); } int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, orte_odls_base_kill_local_fn_t kill_local, orte_odls_base_child_died_fn_t child_died) { - orte_odls_child_t *child; - opal_list_item_t *item; - int rc = ORTE_SUCCESS; + orte_proc_t *child; opal_list_t procs_killed; orte_proc_t *proc, proctmp; - int i; + int i, j; opal_pointer_array_t procarray, *procptr; bool do_cleanup; OBJ_CONSTRUCT(&procs_killed, opal_list_t); - /* since we are going to be working with the global list of - * 
children, we need to protect that list from modification - * by other threads - */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - /* if the pointer array is NULL, then just kill everything */ if (NULL == procs) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, @@ -2588,7 +2133,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OBJ_CONSTRUCT(&proctmp, orte_proc_t); proctmp.name.jobid = ORTE_JOBID_WILDCARD; proctmp.name.vpid = ORTE_VPID_WILDCARD; - ORTE_EPOCH_SET(proctmp.name.epoch,ORTE_EPOCH_WILDCARD); opal_pointer_array_add(&procarray, &proctmp); procptr = &procarray; do_cleanup = true; @@ -2605,29 +2149,28 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procptr, i))) { continue; } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - + for(j=0; j < orte_local_children->size; j++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, j))) { + continue; + } + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc checking child process %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* do we have a child from the specified job? Because the * job could be given as a WILDCARD value, we must * check for that as well as for equality. 
*/ if (ORTE_JOBID_WILDCARD != proc->name.jobid && - proc->name.jobid != child->name->jobid) { + proc->name.jobid != child->name.jobid) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s is not part of job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), ORTE_JOBID_PRINT(proc->name.jobid))); - continue; } @@ -2635,14 +2178,13 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, * appropriately */ if (ORTE_VPID_WILDCARD != proc->name.vpid && - proc->name.vpid != child->name->vpid) { + proc->name.vpid != child->name.vpid) { OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s is not covered by rank %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), ORTE_VPID_PRINT(proc->name.vpid))); - continue; } @@ -2654,14 +2196,13 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s is not alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* ensure, though, that the state is terminated so we don't lockup if * the proc never started */ if (ORTE_PROC_STATE_UNDEF == child->state || ORTE_PROC_STATE_INIT == child->state || - ORTE_PROC_STATE_LAUNCHED == child->state || ORTE_PROC_STATE_RUNNING == child->state) { /* we can't be sure what happened, but make sure we * at least have a value that will let us eventually wakeup @@ -2687,7 +2228,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, * channels will automatically close when the proc is killed */ if (NULL != orte_iof.close) { - orte_iof.close(child->name, ORTE_IOF_STDIN); + orte_iof.close(&child->name, ORTE_IOF_STDIN); } /* cancel the waitpid callback as this induces unmanageable race @@ -2702,14 +2243,14 @@ int 
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING SIGCONT TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGCONT); /* Send a sigterm to the process before sigkill to be nice */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING SIGTERM TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGTERM); /* check to see if it died - the child_died function will continue @@ -2726,7 +2267,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING SIGKILL TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGKILL); /* Double check that it actually died this time */ if (!child_died(child)) { @@ -2745,7 +2286,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s SENDING FORCE SIGKILL TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); kill_local(child->pid, SIGKILL); /* Double check that it actually died this time */ if (!child_died(child)) { @@ -2758,7 +2299,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:kill_local_proc child %s killed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* indicate the waitpid fired as this is effectively what * has happened @@ -2770,18 +2311,12 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, CLEANUP: /* ensure the child's session directory is cleaned up */ - orte_session_dir_finalize(child->name); + 
orte_session_dir_finalize(&child->name); /* check for everything complete - this will remove * the child object from our local list */ if (child->iof_complete && child->waitpid_recvd) { - rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF, - child->name, child->state, child->pid, - child->exit_code); - if (ORTE_ERR_SILENT == rc) { - /* all procs are complete - we are done */ - break; - } + ORTE_ACTIVATE_PROC_STATE(&child->name, child->state); } } } @@ -2792,12 +2327,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, OBJ_DESTRUCT(&proctmp); } - /* we are done with the global list, so we can now release - * any waiting threads - this also allows any callbacks to work - */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - return ORTE_SUCCESS; } @@ -2805,10 +2334,9 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, orte_process_name_t *proc) { int rc; - orte_odls_child_t *child; - opal_list_item_t *item, *next; + orte_proc_t *child; opal_pstats_t stats, *statsptr; - int j; + int i, j; OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:get_proc_stats for proc %s", @@ -2816,14 +2344,13 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, ORTE_NAME_PRINT(proc))); /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } - if (proc->jobid == child->name->jobid && - (proc->vpid == child->name->vpid || + if (proc->jobid == child->name.jobid && + (proc->vpid == child->name.vpid || ORTE_VPID_WILDCARD == proc->vpid)) { /* found it */ OBJ_CONSTRUCT(&stats, opal_pstats_t); @@ -2834,7 +2361,7 @@ int orte_odls_base_get_proc_stats(opal_buffer_t 
*answer, stats.node[j] = orte_process_info.nodename[j]; } /* record rank */ - stats.rank = child->name->vpid; + stats.rank = child->name.vpid; /* get stats */ rc = opal_pstat.query(child->pid, &stats, NULL); if (ORTE_SUCCESS != rc) { @@ -2859,22 +2386,18 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, return ORTE_SUCCESS; } -int orte_odls_base_default_restart_proc(orte_odls_child_t *child, +int orte_odls_base_default_restart_proc(orte_proc_t *child, orte_odls_base_fork_local_proc_fn_t fork_local) { int rc; orte_app_context_t *app; - opal_list_item_t *item; - orte_odls_job_t *jobdat; + orte_job_t *jobdat; char basedir[MAXPATHLEN]; - /* protect operations involving the global list of children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:restart_proc for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); + ORTE_NAME_PRINT(&child->name))); /* establish our baseline working directory - we will be potentially * bouncing around as we execute this app, but we will always return @@ -2883,16 +2406,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, getcwd(basedir, sizeof(basedir)); /* find this child's jobdat */ - jobdat = NULL; - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_next(item)) { - jobdat = (orte_odls_job_t*)item; - if (jobdat->jobid == child->name->jobid) { - break; - } - } - if (NULL == jobdat) { + if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) { /* not found */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; @@ -2902,29 +2416,22 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, child->exit_code = 0; child->waitpid_recvd = false; child->iof_complete = false; - child->coll_recvd = false; child->pid = 0; - child->init_recvd = false; - child->fini_recvd = false; if (NULL != child->rml_uri) { 
free(child->rml_uri); child->rml_uri = NULL; } - app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); + app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx); /* reset envars to match this child */ if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { ORTE_ERROR_LOG(rc); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); goto CLEANUP; } /* setup the path */ if (ORTE_SUCCESS != (rc = setup_path(app))) { ORTE_ERROR_LOG(rc); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); goto CLEANUP; } @@ -2932,12 +2439,6 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app)); - /* must unlock prior to fork to keep things clean in the - * event library - */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - rc = fork_local(app, child, app->env, jobdat); if (ORTE_SUCCESS == rc) { orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); @@ -2947,7 +2448,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:restart of proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), (ORTE_SUCCESS == rc) ? 
"succeeded" : "failed")); /* reset our working directory back to our default location - if we @@ -2961,26 +2462,3 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, return rc; } - -bool orte_odls_base_default_check_finished(orte_process_name_t *proc) { - orte_odls_child_t *child; - opal_list_item_t *item; - orte_ns_cmp_bitmask_t mask; - - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - - /* find this child */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */ - return child->fini_recvd; - } - } - - return false; -} diff --git a/orte/mca/odls/base/odls_base_open.c b/orte/mca/odls/base/odls_base_open.c index afd81aa264..09d7906e4e 100644 --- a/orte/mca/odls/base/odls_base_open.c +++ b/orte/mca/odls/base/odls_base_open.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -35,8 +37,8 @@ #include "opal/util/output.h" #include "opal/util/path.h" #include "opal/util/argv.h" -#include "opal/threads/threads.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/plm/plm_types.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -89,7 +91,7 @@ orte_odls_globals_t orte_odls_globals; int orte_odls_base_open(void) { char **ranks=NULL, *tmp; - int i, rank; + int rc, i, rank; orte_namelist_t *nm; bool xterm_hold; @@ -101,17 +103,17 @@ int orte_odls_base_open(void) "Time to wait for a process to die after issuing a kill signal to it", false, false, 1, &orte_odls_globals.timeout_before_sigkill); - /* initialize the global list of local children and job data */ - OBJ_CONSTRUCT(&orte_local_children, opal_list_t); - OBJ_CONSTRUCT(&orte_local_children_lock, opal_mutex_t); - OBJ_CONSTRUCT(&orte_local_children_cond, opal_condition_t); - OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); - OBJ_CONSTRUCT(&orte_local_jobdata_lock, opal_mutex_t); - OBJ_CONSTRUCT(&orte_local_jobdata_cond, opal_condition_t); + /* initialize the global array of local children */ + orte_local_children = OBJ_NEW(opal_pointer_array_t); + if (OPAL_SUCCESS != (rc = opal_pointer_array_init(orte_local_children, + 1, + ORTE_GLOBAL_ARRAY_MAX_SIZE, + 1))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* initialize ODLS globals */ - OBJ_CONSTRUCT(&orte_odls_globals.mutex, opal_mutex_t); - OBJ_CONSTRUCT(&orte_odls_globals.cond, opal_condition_t); OBJ_CONSTRUCT(&orte_odls_globals.xterm_ranks, opal_list_t); orte_odls_globals.xtermcmd = NULL; orte_odls_globals.dmap = NULL; @@ -133,7 +135,6 @@ int orte_odls_base_open(void) if (-1 == rank) { /* wildcard */ nm->name.vpid = ORTE_VPID_WILDCARD; - ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD); } else if (rank < 0) { /* error out on bozo case */ orte_show_help("help-odls-base.txt", @@ -146,9 +147,8 @@ int orte_odls_base_open(void) * will be in the job - we'll check 
later */ nm->name.vpid = rank; - ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name)); } - opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item); + opal_list_append(&orte_odls_globals.xterm_ranks, &nm->super); } opal_argv_free(ranks); /* construct the xtermcmd */ @@ -188,117 +188,20 @@ int orte_odls_base_open(void) return ORTE_SUCCESS; } -/* instance the child list object */ -static void orte_odls_child_constructor(orte_odls_child_t *ptr) +static void launch_local_const(orte_odls_launch_local_t *ptr) { - ptr->name = NULL; - ptr->restarts = 0; - ptr->pid = 0; - ptr->app_idx = 0; - ptr->alive = false; - ptr->coll_recvd = false; - /* set the default state to "failed to start" so - * we can correctly report should something - * go wrong during launch - */ - ptr->state = ORTE_PROC_STATE_FAILED_TO_START; - ptr->exit_code = 0; - ptr->init_recvd = false; - ptr->fini_recvd = false; - ptr->rml_uri = NULL; - ptr->waitpid_recvd = false; - ptr->iof_complete = false; - ptr->do_not_barrier = false; - ptr->notified = false; - OBJ_CONSTRUCT(&ptr->stats, opal_ring_buffer_t); - opal_ring_buffer_init(&ptr->stats, orte_stat_history_size); -#if OPAL_HAVE_HWLOC - ptr->cpu_bitmap = NULL; -#endif + ptr->ev = opal_event_alloc(); + ptr->job = ORTE_JOBID_INVALID; + ptr->fork_local = NULL; + ptr->retries = 0; } -static void orte_odls_child_destructor(orte_odls_child_t *ptr) +static void launch_local_dest(orte_odls_launch_local_t *ptr) { - opal_pstats_t *st; - - if (NULL != ptr->name) free(ptr->name); - if (NULL != ptr->rml_uri) free(ptr->rml_uri); - - while (NULL != (st = (opal_pstats_t*)opal_ring_buffer_pop(&ptr->stats))) { - OBJ_RELEASE(st); - } - OBJ_DESTRUCT(&ptr->stats); -#if OPAL_HAVE_HWLOC - if (NULL != ptr->cpu_bitmap) { - free(ptr->cpu_bitmap); - } -#endif + opal_event_free(ptr->ev); } -OBJ_CLASS_INSTANCE(orte_odls_child_t, - opal_list_item_t, - orte_odls_child_constructor, - orte_odls_child_destructor); - -static void orte_odls_job_constructor(orte_odls_job_t *ptr) 
-{ - OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t); - OBJ_CONSTRUCT(&ptr->cond, opal_condition_t); - ptr->jobid = ORTE_JOBID_INVALID; - ptr->instance = NULL; - ptr->name = NULL; - ptr->state = ORTE_JOB_STATE_UNDEF; - ptr->launch_msg_processed = false; - OBJ_CONSTRUCT(&ptr->apps, opal_pointer_array_t); - opal_pointer_array_init(&ptr->apps, 2, INT_MAX, 2); - ptr->num_apps = 0; -#if OPAL_HAVE_HWLOC - ptr->binding = 0; -#endif - ptr->cpus_per_rank = 1; - ptr->stride = 1; - ptr->controls = 0; - ptr->stdin_target = ORTE_VPID_INVALID; - ptr->total_slots_alloc = 0; - ptr->num_procs = 0; - ptr->num_local_procs = 0; - ptr->pmap = NULL; - OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t); - OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t); - ptr->collective_type = ORTE_GRPCOMM_COLL_NONE; - ptr->num_contributors = 0; - ptr->num_participating = -1; - ptr->num_collected = 0; - ptr->enable_recovery = false; -} -static void orte_odls_job_destructor(orte_odls_job_t *ptr) -{ - int i; - orte_app_context_t *app; - - OBJ_DESTRUCT(&ptr->lock); - OBJ_DESTRUCT(&ptr->cond); - if (NULL != ptr->instance) { - free(ptr->instance); - } - if (NULL != ptr->name) { - free(ptr->name); - } - for (i=0; i < ptr->apps.size; i++) { - if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(&ptr->apps, i))) { - OBJ_RELEASE(app); - } - OBJ_DESTRUCT(&ptr->apps); - } - if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) { - free(ptr->pmap->bytes); - free(ptr->pmap); - } - - OBJ_DESTRUCT(&ptr->collection_bucket); - OBJ_DESTRUCT(&ptr->local_collection); -} -OBJ_CLASS_INSTANCE(orte_odls_job_t, - opal_list_item_t, - orte_odls_job_constructor, - orte_odls_job_destructor); +OBJ_CLASS_INSTANCE(orte_odls_launch_local_t, + opal_object_t, + launch_local_const, + launch_local_dest); #endif diff --git a/orte/mca/odls/base/odls_base_state.c b/orte/mca/odls/base/odls_base_state.c index 2224209fb4..932c017580 100644 --- a/orte/mca/odls/base/odls_base_state.c +++ b/orte/mca/odls/base/odls_base_state.c @@ -77,17 
+77,14 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context) /* if I am the HNP, then use me as the source */ p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); } else { /* otherwise, set the HNP as the source */ p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); } p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); opal_list_append(&(filem_request->process_sets), &(p_set->super) ); diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h index 9733add738..3a1030eef1 100644 --- a/orte/mca/odls/base/odls_private.h +++ b/orte/mca/odls/base/odls_private.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -31,8 +33,6 @@ #include "opal/class/opal_list.h" #include "opal/class/opal_pointer_array.h" #include "opal/class/opal_bitmap.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" #include "opal/dss/dss_types.h" #include "orte/mca/grpcomm/grpcomm_types.h" @@ -52,14 +52,10 @@ typedef struct { int output; /** Time to allow process to forcibly die */ int timeout_before_sigkill; - /* mutex */ - opal_mutex_t mutex; - /* condition variable */ - opal_condition_t cond; /* byte object to store daemon map for later xmit to procs */ opal_byte_object_t *dmap; /* any co-spawned debugger daemon */ - orte_odls_job_t *debugger; + orte_job_t *debugger; /* debugger launched */ bool debugger_launched; /* list of ranks to be displayed on separate xterms */ @@ -80,22 +76,39 @@ ORTE_DECLSPEC int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, orte_jobid_t job); -ORTE_DECLSPEC int -orte_odls_base_default_update_daemon_info(opal_buffer_t *data); - ORTE_DECLSPEC int orte_odls_base_default_construct_child_list(opal_buffer_t *data, orte_jobid_t *job); /* define a function that will fork a local proc */ typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat); + orte_job_t *jdata); -ORTE_DECLSPEC int -orte_odls_base_default_launch_local(orte_jobid_t job, - orte_odls_base_fork_local_proc_fn_t fork_local); +/* define an object for starting local launch */ +typedef struct { + opal_object_t object; + opal_event_t *ev; + orte_jobid_t job; + orte_odls_base_fork_local_proc_fn_t fork_local; + int retries; +} orte_odls_launch_local_t; +OBJ_CLASS_DECLARATION(orte_odls_launch_local_t); + +#define ORTE_ACTIVATE_LOCAL_LAUNCH(j, f) \ + do { \ + orte_odls_launch_local_t *ll; \ + ll = OBJ_NEW(orte_odls_launch_local_t); \ + ll->job = (j); \ + ll->fork_local = (f); \ + opal_event_set(orte_event_base, 
ll->ev, -1, OPAL_EV_WRITE, \ + orte_odls_base_default_launch_local, ll); \ + opal_event_set_priority(ll->ev, ORTE_SYS_PRI); \ + opal_event_active(ll->ev, OPAL_EV_WRITE, 1); \ + } while(0); + +ORTE_DECLSPEC void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata); ORTE_DECLSPEC int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag); @@ -115,7 +128,7 @@ orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32 typedef int (*orte_odls_base_kill_local_fn_t)(pid_t pid, int signum); /* define a function type to detect that a child died */ -typedef bool (*orte_odls_base_child_died_fn_t)(orte_odls_child_t *child); +typedef bool (*orte_odls_base_child_died_fn_t)(orte_proc_t *child); ORTE_DECLSPEC int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, @@ -126,7 +139,7 @@ ORTE_DECLSPEC int orte_odls_base_default_require_sync(orte_process_name_t *proc, opal_buffer_t *buffer, bool drop_nidmap); -ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_odls_child_t *child, +ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_proc_t *child, orte_odls_base_fork_local_proc_fn_t fork_local); /* diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index d2b1b13a83..0a3ca7a202 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -13,6 +13,8 @@ * Copyright (c) 2007 Evergrid, Inc. All rights reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. 
* * $COPYRIGHT$ * @@ -155,7 +157,7 @@ typedef struct { static int orte_odls_default_launch_local_procs(opal_buffer_t *data); static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs); static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal); -static int orte_odls_default_restart_proc(orte_odls_child_t *child); +static int orte_odls_default_restart_proc(orte_proc_t *child); /* * Explicitly declared functions so that we can get the noreturn @@ -165,9 +167,9 @@ static void send_error_show_help(int fd, int exit_status, const char *file, const char *topic, ...) __opal_attribute_noreturn__; static int do_child(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat, int write_fd, + orte_job_t *jobdat, int write_fd, orte_iof_base_io_conf_t opts) __opal_attribute_noreturn__; @@ -186,7 +188,7 @@ orte_odls_base_module_t orte_odls_default_module = { }; -static bool odls_default_child_died(orte_odls_child_t *child) +static bool odls_default_child_died(orte_proc_t *child) { time_t end; pid_t ret; @@ -381,9 +383,9 @@ static void send_error_show_help(int fd, int exit_status, } static int do_child(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat, int write_fd, + orte_job_t *jobdat, int write_fd, orte_iof_base_io_conf_t opts) { int i; @@ -443,7 +445,7 @@ static int do_child(orte_app_context_t* context, if (NULL == msg) { msg = "failed to convert bitmap list to hwloc bitmap"; } - if (OPAL_BINDING_REQUIRED(jobdat->binding)) { + if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). 
*/ send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", @@ -463,7 +465,7 @@ static int do_child(orte_app_context_t* context, if (opal_hwloc_report_bindings) { opal_output(0, "%s odls:default binding child %s to cpus %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), child->cpu_bitmap); + ORTE_NAME_PRINT(&child->name), child->cpu_bitmap); } rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0); if (rc < 0) { @@ -476,7 +478,7 @@ static int do_child(orte_app_context_t* context, asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"", opal_strerror(rc), child->cpu_bitmap); } - if (OPAL_BINDING_REQUIRED(jobdat->binding)) { + if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). */ send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", @@ -615,9 +617,9 @@ static int do_child(orte_app_context_t* context, static int do_parent(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat, int read_fd, + orte_job_t *jobdat, int read_fd, orte_iof_base_io_conf_t opts) { int rc; @@ -626,7 +628,7 @@ static int do_parent(orte_app_context_t* context, if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { /* connect endpoints IOF */ - rc = orte_iof_base_setup_parent(child->name, &opts); + rc = orte_iof_base_setup_parent(&child->name, &opts); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); close(read_fd); @@ -637,7 +639,7 @@ static int do_parent(orte_app_context_t* context, return rc; } } - + /* Block reading a message from the pipe */ while (1) { rc = opal_fd_read(read_fd, sizeof(msg), &msg); @@ -734,7 +736,7 @@ static int do_parent(orte_app_context_t* context, indication of a fatal error, meaning that the child process launched successfully. 
*/ if (NULL != child) { - child->state = ORTE_PROC_STATE_LAUNCHED; + child->state = ORTE_PROC_STATE_RUNNING; child->alive = true; } close(read_fd); @@ -747,9 +749,9 @@ static int do_parent(orte_app_context_t* context, * Fork/exec the specified processes */ static int odls_default_fork_local_proc(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat) + orte_job_t *jobdat) { orte_iof_base_io_conf_t opts; int rc, p[2]; @@ -762,7 +764,8 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, /* do we want to setup stdin? */ if (NULL != child && - (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) { + (jobdat->stdin_target == ORTE_VPID_WILDCARD || + child->name.vpid == jobdat->stdin_target)) { opts.connect_stdin = true; } else { opts.connect_stdin = false; @@ -777,7 +780,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, return rc; } } - + /* A pipe is used to communicate between the parent and child to indicate whether the exec ultimately succeeded or failed. 
The child sets the pipe to be close-on-exec; the child only ever @@ -829,49 +832,24 @@ int orte_odls_default_launch_local_procs(opal_buffer_t *data) { int rc; orte_jobid_t job; - orte_job_t *jdata; /* construct the list of children we are to launch */ if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) { OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, "%s odls:default:launch:local failed to construct child list on error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); - goto CLEANUP; + return rc; } /* launch the local procs */ - if (ORTE_SUCCESS != (rc = orte_odls_base_default_launch_local(job, odls_default_fork_local_proc))) { - OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, - "%s odls:default:launch:local failed to launch on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); - goto CLEANUP; - } + ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_default_fork_local_proc); - /* look up job data object */ - if (NULL != (jdata = orte_get_job_data_object(job))) { - if (jdata->state & ORTE_JOB_STATE_SUSPENDED) { - if (ORTE_PROC_IS_HNP) { - /* Have the plm send the signal to all the nodes. - If the signal arrived before the orteds started, - then they won't know to suspend their procs. - The plm also arranges for any local procs to - be signaled. - */ - orte_plm.signal_job(jdata->jobid, SIGTSTP); - } else { - orte_odls_default_signal_local_procs(NULL, SIGTSTP); - } - } - } - -CLEANUP: - - return rc; + return ORTE_SUCCESS; } /** - * Send a sigal to a pid. Note that if we get an error, we set the + * Send a signal to a pid. Note that if we get an error, we set the * return value and let the upper layer print out the message. 
*/ static int send_signal(pid_t pid, int signal) @@ -922,7 +900,7 @@ static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, return ORTE_SUCCESS; } -static int orte_odls_default_restart_proc(orte_odls_child_t *child) +static int orte_odls_default_restart_proc(orte_proc_t *child) { int rc; diff --git a/orte/mca/odls/odls.h b/orte/mca/odls/odls.h index ea09f89c65..a36ba9e26d 100644 --- a/orte/mca/odls/odls.h +++ b/orte/mca/odls/odls.h @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +36,8 @@ #include "opal/dss/dss_types.h" #include "orte/mca/rml/rml_types.h" +#include "orte/runtime/orte_globals.h" + #include "orte/mca/odls/odls_types.h" BEGIN_C_DECLS @@ -85,7 +89,7 @@ typedef int (*orte_odls_base_module_require_sync_fn_t)(orte_process_name_t *proc /** * Restart a local process */ -typedef int (*orte_odls_base_module_restart_proc_fn_t)(orte_odls_child_t *child); +typedef int (*orte_odls_base_module_restart_proc_fn_t)(orte_proc_t *child); /** * pls module version @@ -94,7 +98,7 @@ struct orte_odls_base_module_1_3_0_t { orte_odls_base_module_get_add_procs_data_fn_t get_add_procs_data; orte_odls_base_module_launch_local_processes_fn_t launch_local_procs; orte_odls_base_module_kill_local_processes_fn_t kill_local_procs; - orte_odls_base_module_signal_local_process_fn_t signal_local_procs; + orte_odls_base_module_signal_local_process_fn_t signal_local_procs; orte_odls_base_module_deliver_message_fn_t deliver_message; orte_odls_base_module_require_sync_fn_t require_sync; orte_odls_base_module_restart_proc_fn_t restart_proc; diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 5c419d15e2..fa5f9e5ea9 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ 
-9,7 +9,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -32,9 +34,8 @@ #include "opal/class/opal_list.h" #include "opal/class/opal_ring_buffer.h" #include "opal/dss/dss_types.h" -#include "opal/threads/mutex.h" -#include "opal/threads/condition.h" #include "opal/mca/hwloc/hwloc.h" +#include "opal/mca/event/event.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/grpcomm/grpcomm_types.h" @@ -72,10 +73,6 @@ typedef uint8_t orte_daemon_cmd_flag_t; #define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 18 #define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 19 -/* proc termination sync cmds */ -#define ORTE_DAEMON_WAITPID_FIRED (orte_daemon_cmd_flag_t) 20 -#define ORTE_DAEMON_IOF_COMPLETE (orte_daemon_cmd_flag_t) 21 - /* request proc resource usage */ #define ORTE_DAEMON_TOP_CMD (orte_daemon_cmd_flag_t) 22 @@ -92,76 +89,6 @@ typedef uint8_t orte_daemon_cmd_flag_t; /* process called "errmgr.abort_procs" */ #define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28 -/* - * List object to locally store the process names and pids of - * our children. This can subsequently be used to order termination - * or pass signals without looking the info up again. - */ -typedef struct { - opal_list_item_t super; /* required to place this on a list */ - orte_process_name_t *name; /* the OmpiRTE name of the proc */ - int32_t restarts; /* number of times this proc has been restarted */ - pid_t pid; /* local pid of the proc */ - orte_app_idx_t app_idx; /* index of the app_context for this proc */ - bool alive; /* is this proc alive? 
*/ - bool coll_recvd; /* collective operation recvd */ - orte_proc_state_t state; /* the state of the process */ - orte_exit_code_t exit_code; /* process exit code */ - bool init_recvd; /* process called orte_init */ - bool fini_recvd; /* process called orte_finalize */ - char *rml_uri; /* contact info for this child */ -#if OPAL_HAVE_HWLOC - char *cpu_bitmap; /* binding pattern for this child */ -#endif - bool waitpid_recvd; /* waitpid has detected proc termination */ - bool iof_complete; /* IOF has noted proc terminating all channels */ - struct timeval starttime; /* when the proc was started - for timing purposes only */ - bool do_not_barrier; /* the proc should not barrier in orte_init */ - bool notified; /* notification of termination has been sent */ - opal_ring_buffer_t stats; -} orte_odls_child_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t); - -#if !ORTE_DISABLE_FULL_SUPPORT - -/* - * List object to locally store job related info - */ -typedef struct orte_odls_job_t { - opal_list_item_t super; /* required to place this on a list */ - opal_mutex_t lock; - opal_condition_t cond; - orte_job_state_t state; /* state of the job */ - orte_jobid_t jobid; /* jobid for this data */ - char *instance; /* keep handy for scheduler restart */ - char *name; /* keep handy for scheduler restart */ - bool launch_msg_processed; /* launch msg has been fully processed */ - opal_pointer_array_t apps; /* app_contexts for this job */ - orte_app_idx_t num_apps; /* number of app_contexts */ -#if OPAL_HAVE_HWLOC - opal_binding_policy_t binding; /* binding policy */ -#endif - int16_t cpus_per_rank; /* number of cpus/rank */ - int16_t stride; /* step size between cores of multi-core/rank procs */ - orte_job_controls_t controls; /* control flags for job */ - orte_vpid_t stdin_target; /* where stdin is to go */ - orte_std_cntr_t total_slots_alloc; - orte_std_cntr_t num_nodes; /* number of nodes involved in the job */ - orte_vpid_t num_procs; - int32_t num_local_procs; - 
opal_byte_object_t *pmap; /* local copy of pidmap byte object */ - opal_buffer_t collection_bucket; - opal_buffer_t local_collection; - orte_grpcomm_coll_t collective_type; - int32_t num_contributors; - int num_participating; - int num_collected; - struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */ - bool enable_recovery; /* enable recovery of failed processes */ -} orte_odls_job_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t); - -#endif END_C_DECLS diff --git a/orte/mca/odls/process/odls_process_module.c b/orte/mca/odls/process/odls_process_module.c index b289f1d992..f6b75fc7d3 100644 --- a/orte/mca/odls/process/odls_process_module.c +++ b/orte/mca/odls/process/odls_process_module.c @@ -50,7 +50,7 @@ static void set_handler_default(int sig); -static bool odls_process_child_died( orte_odls_child_t *child ) +static bool odls_process_child_died( orte_proc_t *child ) { int error; HANDLE handle = OpenProcess( PROCESS_TERMINATE | SYNCHRONIZE, FALSE, @@ -92,9 +92,9 @@ static int odls_process_kill_local_procs(opal_pointer_array_t *procs) */ static int odls_process_fork_local_proc(orte_app_context_t* context, - orte_odls_child_t *child, + orte_proc_t *child, char **environ_copy, - orte_odls_job_t *jobdat) + orte_job_t *jobdat) { pid_t pid; orte_iof_base_io_conf_t opts; @@ -108,7 +108,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, */ if (opal_sys_limits.initialized) { if (0 < opal_sys_limits.num_procs && - opal_sys_limits.num_procs <= (int)opal_list_get_size(&orte_local_children)) { + opal_sys_limits.num_procs <= *(&orte_local_children->size)) { /* at the system limit - abort */ ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); child->state = ORTE_PROC_STATE_FAILED_TO_START; @@ -122,7 +122,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, opts.usepty = OPAL_ENABLE_PTY_SUPPORT; /* do we want to setup stdin? 
*/ - if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target) { + if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name.vpid == jobdat->stdin_target) { opts.connect_stdin = true; } else { opts.connect_stdin = false; @@ -161,16 +161,10 @@ static int odls_process_fork_local_proc(orte_app_context_t* context, } /* set the proc state to LAUNCHED and save the pid */ - child->state = ORTE_PROC_STATE_LAUNCHED; + child->state = ORTE_PROC_STATE_RUNNING; child->pid = pid; child->alive = true; - - /* Windows automatically forwards IO, so we don't need to do so here. However, - * we need to flag that IO termination conditions are met so that the daemon - * knows the proc is done - */ - orte_odls_base_notify_iof_complete(child->name); - + return ORTE_SUCCESS; } @@ -193,12 +187,7 @@ static int odls_process_launch_local_procs(opal_buffer_t *data) } /* launch the local procs */ - if (ORTE_SUCCESS != (rc = orte_odls_base_default_launch_local(job, odls_process_fork_local_proc))) { - OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, - "%s odls:process:launch:local failed to launch on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); - goto CLEANUP; - } + ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_process_fork_local_proc); CLEANUP: @@ -220,7 +209,7 @@ static int odls_process_signal_local_proc(const orte_process_name_t *proc, int32 return rc; } -static int orte_odls_process_restart_proc(orte_odls_child_t *child) +static int orte_odls_process_restart_proc(orte_proc_t *child) { int rc; diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index aa8b6ebd10..e6067197d1 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2010 Los Alamos National Security, LLC. 
+ * Copyright (c) 2006-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. @@ -70,13 +70,16 @@ */ struct mca_oob_tcp_event_t { opal_list_item_t item; - opal_event_t event; + opal_event_t *event; }; typedef struct mca_oob_tcp_event_t mca_oob_tcp_event_t; static void mca_oob_tcp_event_construct(mca_oob_tcp_event_t* event) { OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); + /* get an event */ + event->event = opal_event_alloc(); + /* track our events */ opal_list_append(&mca_oob_tcp_component.tcp_events, &event->item); OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } @@ -84,6 +87,9 @@ static void mca_oob_tcp_event_construct(mca_oob_tcp_event_t* event) static void mca_oob_tcp_event_destruct(mca_oob_tcp_event_t* event) { OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); + /* release the event for re-use */ + opal_event_free(event->event); + /* remove it from our list */ opal_list_remove_item(&mca_oob_tcp_component.tcp_events, &event->item); OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } @@ -414,6 +420,12 @@ static int mca_oob_tcp_component_open(void) OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_connections_return, opal_list_t); OBJ_CONSTRUCT(&mca_oob_tcp_component.tcp_connections_lock, opal_mutex_t); + mca_oob_tcp_component.tcp_recv_event = NULL; +#if OPAL_WANT_IPV6 + mca_oob_tcp_component.tcp6_recv_event = NULL; +#endif + + mca_oob_tcp_component.tcp_listen_thread_event = NULL; mca_oob_tcp_component.tcp_listen_thread_num_sockets = 0; mca_oob_tcp_component.tcp_listen_thread_sds[0] = -1; mca_oob_tcp_component.tcp_listen_thread_sds[1] = -1; @@ -507,8 +519,9 @@ mca_oob_tcp_create_connection(const int accepted_fd, /* wait for receipt of peers process identifier to complete this connection */ event = OBJ_NEW(mca_oob_tcp_event_t); - opal_event_set(opal_event_base, &event->event, accepted_fd, OPAL_EV_READ, 
mca_oob_tcp_recv_handler, event); - opal_event_add(&event->event, 0); + opal_event_set(orte_event_base, event->event, accepted_fd, OPAL_EV_READ, mca_oob_tcp_recv_handler, event); + opal_event_set_priority(event->event, ORTE_MSG_PRI); + opal_event_add(event->event, 0); } @@ -1098,16 +1111,25 @@ mca_oob_tcp_accept_thread_handler(int sd, short flags, void* user) tv.tv_sec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_sec; tv.tv_usec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_usec; #ifdef HAVE_PIPE - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_listen_thread_event, - mca_oob_tcp_component.tcp_connections_pipe[0], - OPAL_EV_READ, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + mca_oob_tcp_component.tcp_connections_pipe[0], + OPAL_EV_READ, + mca_oob_tcp_accept_thread_handler, NULL); + } #else - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_listen_thread_event, - -1, 0, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + -1, 0, + mca_oob_tcp_accept_thread_handler, NULL); + } #endif - opal_event_add(&mca_oob_tcp_component.tcp_listen_thread_event, &tv); + opal_event_set_priority(mca_oob_tcp_component.tcp_listen_thread_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp_listen_thread_event, &tv); } @@ -1134,16 +1156,25 @@ mca_oob_tcp_create_listen_thread(void) tv.tv_sec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_sec; tv.tv_usec = mca_oob_tcp_component.tcp_listen_thread_tv.tv_usec; #ifdef HAVE_PIPE - opal_event_set(opal_event_base, 
&mca_oob_tcp_component.tcp_listen_thread_event, - mca_oob_tcp_component.tcp_connections_pipe[0], - OPAL_EV_READ, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + mca_oob_tcp_component.tcp_connections_pipe[0], + OPAL_EV_READ, + mca_oob_tcp_accept_thread_handler, NULL); + } #else - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_listen_thread_event, - -1, 0, - mca_oob_tcp_accept_thread_handler, NULL); + if (NULL == mca_oob_tcp_component.tcp_listen_thread_event) { + /* get an event */ + mca_oob_tcp_component.tcp_listen_thread_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_listen_thread_event, + -1, 0, + mca_oob_tcp_accept_thread_handler, NULL); + } #endif - opal_event_add(&mca_oob_tcp_component.tcp_listen_thread_event, &tv); + opal_event_set_priority(mca_oob_tcp_component.tcp_listen_thread_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp_listen_thread_event, &tv); return opal_thread_start(&mca_oob_tcp_component.tcp_listen_thread); } @@ -1570,12 +1601,6 @@ mca_oob_t* mca_oob_tcp_component_init(int* priority) 8); /* increment to grow by */ - /* intialize event library */ - memset(&mca_oob_tcp_component.tcp_recv_event, 0, sizeof(opal_event_t)); - memset(&mca_oob_tcp_component.tcp_listen_thread_event, 0, sizeof(opal_event_t)); -#if OPAL_WANT_IPV6 - memset(&mca_oob_tcp_component.tcp6_recv_event, 0, sizeof(opal_event_t)); -#endif /* OPAL_WANT_IPV6 */ return &mca_oob_tcp; } @@ -1767,12 +1792,16 @@ int mca_oob_tcp_init(void) mca_oob_tcp_component.tcp_listen_thread_sds[idx] = mca_oob_tcp_component.tcp_listen_sd; } else { - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp_recv_event, - mca_oob_tcp_component.tcp_listen_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - 
mca_oob_tcp_recv_handler, - 0); - opal_event_add(&mca_oob_tcp_component.tcp_recv_event, 0); + if (NULL == mca_oob_tcp_component.tcp_recv_event) { + mca_oob_tcp_component.tcp_recv_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp_recv_event, + mca_oob_tcp_component.tcp_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_oob_tcp_recv_handler, + 0); + opal_event_set_priority(mca_oob_tcp_component.tcp_recv_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp_recv_event, 0); + } } } @@ -1799,12 +1828,16 @@ int mca_oob_tcp_init(void) mca_oob_tcp_component.tcp_listen_thread_sds[idx] = mca_oob_tcp_component.tcp6_listen_sd; } else { - opal_event_set(opal_event_base, &mca_oob_tcp_component.tcp6_recv_event, - mca_oob_tcp_component.tcp6_listen_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_oob_tcp_recv_handler, - 0); - opal_event_add(&mca_oob_tcp_component.tcp6_recv_event, 0); + if (NULL == mca_oob_tcp_component.tcp6_recv_event) { + mca_oob_tcp_component.tcp6_recv_event = opal_event_alloc(); + opal_event_set(orte_event_base, mca_oob_tcp_component.tcp6_recv_event, + mca_oob_tcp_component.tcp6_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_oob_tcp_recv_handler, + 0); + opal_event_set_priority(mca_oob_tcp_component.tcp6_recv_event, ORTE_MSG_PRI); + opal_event_add(mca_oob_tcp_component.tcp6_recv_event, 0); + } } } #endif @@ -1857,14 +1890,21 @@ int mca_oob_tcp_fini(void) if (OOB_TCP_LISTEN_THREAD == mca_oob_tcp_component.tcp_listen_type) { mca_oob_tcp_component.tcp_shutdown = true; opal_thread_join(&mca_oob_tcp_component.tcp_listen_thread, &data); - opal_event_del(&mca_oob_tcp_component.tcp_listen_thread_event); + if (NULL != mca_oob_tcp_component.tcp_listen_thread_event) { + opal_event_free(mca_oob_tcp_component.tcp_listen_thread_event); + mca_oob_tcp_component.tcp_listen_thread_event = NULL; + } } else { - if (mca_oob_tcp_component.tcp_listen_sd >= 0) { - opal_event_del(&mca_oob_tcp_component.tcp_recv_event); + if 
(mca_oob_tcp_component.tcp_listen_sd >= 0 && + NULL != mca_oob_tcp_component.tcp_recv_event) { + opal_event_free(mca_oob_tcp_component.tcp_recv_event); + mca_oob_tcp_component.tcp_recv_event = NULL; } #if OPAL_WANT_IPV6 - if (mca_oob_tcp_component.tcp6_listen_sd >= 0) { - opal_event_del(&mca_oob_tcp_component.tcp6_recv_event); + if (mca_oob_tcp_component.tcp6_listen_sd >= 0 && + NULL != mca_oob_tcp_component.tcp6_recv_event) { + opal_event_free(mca_oob_tcp_component.tcp6_recv_event); + mca_oob_tcp_component.tcp6_recv_event = NULL; } #endif } @@ -1894,7 +1934,6 @@ int mca_oob_tcp_fini(void) item != opal_list_get_end(&mca_oob_tcp_component.tcp_events); item = opal_list_get_first(&mca_oob_tcp_component.tcp_events) ) { mca_oob_tcp_event_t* event = (mca_oob_tcp_event_t*)item; - opal_event_del(&event->event); OBJ_RELEASE(event); } diff --git a/orte/mca/oob/tcp/oob_tcp.h b/orte/mca/oob/tcp/oob_tcp.h index ca0ec8d223..ecb16f2703 100644 --- a/orte/mca/oob/tcp/oob_tcp.h +++ b/orte/mca/oob/tcp/oob_tcp.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -206,14 +206,14 @@ struct mca_oob_tcp_component_t { int tcp_sndbuf; /**< socket send buffer size */ int tcp_rcvbuf; /**< socket recv buffer size */ opal_free_list_t tcp_msgs; /**< free list of messages */ - opal_event_t tcp_recv_event; /**< event structure for IPv4 recvs */ + opal_event_t *tcp_recv_event; /**< event structure for IPv4 recvs */ int tcp_listen_sd; /**< listen socket for incoming IPv4 connection requests */ unsigned short tcp_listen_port; /**< IPv4 listen port */ char** tcp4_static_ports; /**< Static ports - IPV4 */ char** tcp4_dyn_ports; /**< Dynamic ports - IPV4 */ int disable_family; /**< disable AF: 0-nothing, 4-IPv4, 6-IPv6 */ #if OPAL_WANT_IPV6 - opal_event_t tcp6_recv_event; /**< event structure for IPv6 recvs */ + opal_event_t *tcp6_recv_event; /**< event structure for IPv6 recvs */ int tcp6_listen_sd; /**< listen socket for incoming IPv6 connection requests */ unsigned short tcp6_listen_port; /**< IPv6 listen port */ char** tcp6_static_ports; /**< Static ports - IPV6 */ @@ -239,7 +239,7 @@ struct mca_oob_tcp_component_t { opal_list_t tcp_connections_return; /**< List of connection fragments being returned to accept thread */ opal_mutex_t tcp_connections_lock; /**< Lock protecting pending_connections and connections_return */ int tcp_connections_pipe[2]; - opal_event_t tcp_listen_thread_event; + opal_event_t *tcp_listen_thread_event; int tcp_copy_max_size; /**< Max size of the copy list before copying must commence */ int tcp_listen_thread_num_sockets; /**< Number of sockets in tcp_listen_thread_sds */ diff --git a/orte/mca/oob/tcp/oob_tcp_msg.c b/orte/mca/oob/tcp/oob_tcp_msg.c index 2a00fc0b0a..0468404f4e 100644 --- a/orte/mca/oob/tcp/oob_tcp_msg.c +++ b/orte/mca/oob/tcp/oob_tcp_msg.c @@ -261,11 +261,13 @@ static bool mca_oob_tcp_msg_recv(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pee else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK) { return false; } - opal_output(0, "%s-%s mca_oob_tcp_msg_recv: 
readv failed: %s (%d)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->peer_name)), - strerror(opal_socket_errno), - opal_socket_errno); + if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) { + opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(peer->peer_name)), + strerror(opal_socket_errno), + opal_socket_errno); + } mca_oob_tcp_peer_close(peer); if (NULL != mca_oob_tcp.oob_exception_callback) { mca_oob_tcp.oob_exception_callback(&peer->peer_name, ORTE_RML_PEER_DISCONNECTED); diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index cf6220c665..22476eb17c 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. + * Copyright (c) 2006-2011 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. 
@@ -60,6 +60,7 @@ #include "opal/mca/event/event.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" @@ -102,12 +103,13 @@ static void mca_oob_tcp_peer_construct(mca_oob_tcp_peer_t* peer) { OBJ_CONSTRUCT(&(peer->peer_send_queue), opal_list_t); OBJ_CONSTRUCT(&(peer->peer_lock), opal_mutex_t); - memset(&peer->peer_send_event, 0, sizeof(peer->peer_send_event)); - memset(&peer->peer_recv_event, 0, sizeof(peer->peer_recv_event)); peer->peer_sd = -1; peer->peer_current_af = AF_UNSPEC; - memset(&peer->peer_timer_event, 0, sizeof(peer->peer_timer_event)); - opal_event_evtimer_set(opal_event_base, &peer->peer_timer_event, mca_oob_tcp_peer_timer_handler, peer); + /* get events */ + peer->peer_send_event = opal_event_alloc(); + peer->peer_recv_event = opal_event_alloc(); + peer->peer_timer_event = opal_event_alloc(); + opal_event_evtimer_set(orte_event_base, peer->peer_timer_event, mca_oob_tcp_peer_timer_handler, peer); } /* @@ -133,22 +135,21 @@ static void mca_oob_tcp_peer_destruct(mca_oob_tcp_peer_t * peer) */ static int mca_oob_tcp_peer_event_init(mca_oob_tcp_peer_t* peer) { - memset(&peer->peer_recv_event, 0, sizeof(peer->peer_recv_event)); - memset(&peer->peer_send_event, 0, sizeof(peer->peer_send_event)); - - if (peer->peer_sd >= 0) { - opal_event_set(opal_event_base, - &peer->peer_recv_event, + if (peer->peer_sd >= 0) { + opal_event_set(orte_event_base, + peer->peer_recv_event, peer->peer_sd, OPAL_EV_READ|OPAL_EV_PERSIST, mca_oob_tcp_peer_recv_handler, peer); - opal_event_set(opal_event_base, - &peer->peer_send_event, + opal_event_set_priority(peer->peer_recv_event, ORTE_MSG_PRI); + opal_event_set(orte_event_base, + peer->peer_send_event, peer->peer_sd, OPAL_EV_WRITE|OPAL_EV_PERSIST, mca_oob_tcp_peer_send_handler, peer); + opal_event_set_priority(peer->peer_send_event, ORTE_MSG_PRI); } return ORTE_SUCCESS; @@ -181,7 +182,7 @@ int 
mca_oob_tcp_peer_send(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg) append to the peer_send_queue. */ OPAL_THREAD_UNLOCK(&peer->peer_lock); rc = mca_oob_tcp_resolve(peer); - if (ORTE_ERR_ADDRESSEE_UNKNOWN != OPAL_SOS_GET_ERROR_CODE(rc)) { + if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) { OPAL_THREAD_LOCK(&peer->peer_lock); opal_list_append(&peer->peer_send_queue, (opal_list_item_t*)msg); @@ -204,7 +205,7 @@ int mca_oob_tcp_peer_send(mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg) /*if the send does not complete */ if(!mca_oob_tcp_msg_send_handler(msg, peer)) { peer->peer_send_msg = msg; - opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); } else { mca_oob_tcp_msg_complete(msg, &peer->peer_name); } @@ -399,7 +400,7 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family); if (ORTE_SUCCESS != rc) { struct timeval tv = { 1,0 }; - opal_event_evtimer_add(&peer->peer_timer_event, &tv); + opal_event_evtimer_add(peer->peer_timer_event, &tv); return rc; } @@ -414,7 +415,7 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) if (connect(peer->peer_sd, (struct sockaddr*)&inaddr, addrlen) < 0) { /* non-blocking so wait for completion */ if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { - opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); return ORTE_SUCCESS; } @@ -445,7 +446,7 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer) /* send our globally unique process identifier to the peer */ if((rc = mca_oob_tcp_peer_send_connect_ack(peer, peer->peer_sd)) == ORTE_SUCCESS) { peer->peer_state = MCA_OOB_TCP_CONNECT_ACK; - opal_event_add(&peer->peer_recv_event, 0); + opal_event_add(peer->peer_recv_event, 0); return ORTE_SUCCESS; } else { opal_output(0, @@ -505,8 +506,10 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) int so_error = 0; opal_socklen_t 
so_length = sizeof(so_error); - /* unregister from receiving event notifications */ - opal_event_del(&peer->peer_send_event); + /* unregister from receiving event notifications, + * but keep the event in case we need it later + */ + opal_event_del(peer->peer_send_event); /* check connect completion status */ if(getsockopt(sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) { @@ -520,7 +523,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) } if(so_error == EINPROGRESS) { - opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); return; } else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) { struct timeval tv = { 1,0 }; @@ -534,7 +537,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) } mca_oob_tcp_peer_shutdown(peer); if( MCA_OOB_TCP_FAILED != peer->peer_state ) { - opal_event_evtimer_add(&peer->peer_timer_event, &tv); + opal_event_evtimer_add(peer->peer_timer_event, &tv); } return; } else if(so_error != 0) { @@ -554,7 +557,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) if (mca_oob_tcp_peer_send_connect_ack(peer, sd) == ORTE_SUCCESS) { peer->peer_state = MCA_OOB_TCP_CONNECT_ACK; - opal_event_add(&peer->peer_recv_event, 0); + opal_event_add(peer->peer_recv_event, 0); } else { opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: unable to send connect ack.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -569,7 +572,7 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) */ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd) { - opal_event_del(&peer->peer_timer_event); + opal_event_del(peer->peer_timer_event); peer->peer_state = MCA_OOB_TCP_CONNECTED; peer->peer_retries = 0; @@ -578,7 +581,7 @@ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd) peer->peer_send_msg = (mca_oob_tcp_msg_t*) opal_list_remove_first(&peer->peer_send_queue); } - 
opal_event_add(&peer->peer_send_event, 0); + opal_event_add(peer->peer_send_event, 0); } } @@ -618,20 +621,8 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) /* inform the ERRMGR framework that we have lost a connection so * it can decide if this is important, what to do about it, etc. */ - if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state( - peer->peer_name.jobid, - ORTE_JOB_STATE_COMM_FAILED, - &peer->peer_name, - ORTE_PROC_STATE_COMM_FAILED, - 0, - ORTE_ERROR_DEFAULT_EXIT_CODE)) { - /* Should free the peer lock before we abort so we don't - * get stuck in the orte_wait_kill when receiving messages in the - * tcp OOB - */ - OPAL_THREAD_UNLOCK(&peer->peer_lock); - orte_errmgr.abort(ORTE_ERR_CONNECTION_FAILED, NULL); - } + ORTE_ACTIVATE_PROC_STATE(&peer->peer_name, + ORTE_PROC_STATE_COMM_FAILED); } void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) @@ -679,14 +670,16 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) close_socket: if (peer->peer_sd >= 0) { - opal_event_del(&peer->peer_recv_event); - opal_event_del(&peer->peer_send_event); + /* keep the events for re-use */ + opal_event_del(peer->peer_recv_event); + opal_event_del(peer->peer_send_event); CLOSE_THE_SOCKET(peer->peer_sd); peer->peer_sd = -1; peer->peer_current_af = AF_UNSPEC; } - - opal_event_del(&peer->peer_timer_event); + + /* keep the event for re-use */ + opal_event_del(peer->peer_timer_event); if( MCA_OOB_TCP_FAILED != peer->peer_state ) { peer->peer_state = MCA_OOB_TCP_CLOSED; } @@ -737,9 +730,9 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd) ORTE_NAME_PRINT(&(peer->peer_name)), strerror(opal_socket_errno)); } - opal_event_del(&peer->peer_recv_event); + opal_event_del(peer->peer_recv_event); mca_oob_tcp_peer_shutdown(peer); - opal_event_evtimer_add(&peer->peer_timer_event, &tv); + opal_event_evtimer_add(peer->peer_timer_event, &tv); return ORTE_SUCCESS; } else { mca_oob_tcp_peer_close(peer); @@ -996,7 +989,7 @@ static void 
mca_oob_tcp_peer_send_handler(int sd, short flags, void* user) /* if nothing else to do unregister for send event notifications */ if(NULL == peer->peer_send_msg) { - opal_event_del(&peer->peer_send_event); + opal_event_del(peer->peer_send_event); } break; } @@ -1005,7 +998,7 @@ static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->peer_name)), peer->peer_state); - opal_event_del(&peer->peer_send_event); + opal_event_del(peer->peer_send_event); break; } OPAL_THREAD_UNLOCK(&peer->peer_lock); @@ -1113,7 +1106,7 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd) mca_oob_tcp_peer_connected(peer, sd); if (sd == peer->peer_sd) { - opal_event_add(&peer->peer_recv_event, 0); + opal_event_add(peer->peer_recv_event, 0); } if(mca_oob_tcp_component.tcp_debug > 0) { mca_oob_tcp_peer_dump(peer, "accepted"); diff --git a/orte/mca/oob/tcp/oob_tcp_peer.h b/orte/mca/oob/tcp/oob_tcp_peer.h index 68481dcd52..54c7fae06d 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.h +++ b/orte/mca/oob/tcp/oob_tcp_peer.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -64,9 +66,9 @@ struct mca_oob_tcp_peer_t { mca_oob_tcp_addr_t* peer_addr; /**< the addresses of the peer process */ int peer_sd; /**< socket descriptor of the connection */ uint16_t peer_current_af; /**< currently connecting af */ - opal_event_t peer_send_event; /**< registration with event thread for send events */ - opal_event_t peer_recv_event; /**< registration with event thread for recv events */ - opal_event_t peer_timer_event; /**< timer for retrying connection failures */ + opal_event_t *peer_send_event; /**< registration with event thread for send events */ + opal_event_t *peer_recv_event; /**< registration with event thread for recv events */ + opal_event_t *peer_timer_event; /**< timer for retrying connection failures */ opal_mutex_t peer_lock; /**< protect critical data structures */ opal_list_t peer_send_queue; /**< list of messages to send */ mca_oob_tcp_msg_t *peer_send_msg; /**< current send in progress */ diff --git a/orte/mca/oob/tcp/oob_tcp_ping.c b/orte/mca/oob/tcp/oob_tcp_ping.c index a48e853604..db568a0d46 100644 --- a/orte/mca/oob/tcp/oob_tcp_ping.c +++ b/orte/mca/oob/tcp/oob_tcp_ping.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -87,7 +89,7 @@ mca_oob_tcp_ping(const orte_process_name_t* name, struct timeval tv; struct iovec iov; #ifndef __WINDOWS__ - opal_event_t sigpipe_handler; + opal_event_t *sigpipe_handler; #endif socklen_t addrlen; @@ -187,9 +189,10 @@ mca_oob_tcp_ping(const orte_process_name_t* name, #ifndef __WINDOWS__ /* Ignore SIGPIPE in the write -- determine success or failure in the ping by looking at the return code from write() */ - opal_event_signal_set(opal_event_base, &sigpipe_handler, SIGPIPE, - noop, &sigpipe_handler); - opal_event_signal_add(&sigpipe_handler, NULL); + sigpipe_handler = opal_event_alloc(); + opal_event_signal_set(orte_event_base, sigpipe_handler, SIGPIPE, + noop, sigpipe_handler); + opal_event_signal_add(sigpipe_handler, NULL); #endif /* Do the write and see what happens. Use the writev version just to * make Windows happy as there the write function is limitted to @@ -200,7 +203,7 @@ mca_oob_tcp_ping(const orte_process_name_t* name, rc = writev(sd, &iov, 1 ); #ifndef __WINDOWS__ /* Now de-register the handler */ - opal_event_signal_del(&sigpipe_handler); + opal_event_free(sigpipe_handler); #endif if (rc != sizeof(hdr)) { CLOSE_THE_SOCKET(sd); @@ -231,5 +234,8 @@ mca_oob_tcp_ping(const orte_process_name_t* name, static void noop(int fd, short event, void *arg) { - /* Nothing */ + opal_event_t *ev = (opal_event_t*)arg; + + /* return the event */ + opal_event_free(ev); } diff --git a/orte/mca/oob/tcp/oob_tcp_send.c b/orte/mca/oob/tcp/oob_tcp_send.c index d0722ddfd8..fe4175b83a 100644 --- a/orte/mca/oob/tcp/oob_tcp_send.c +++ b/orte/mca/oob/tcp/oob_tcp_send.c @@ -26,6 +26,29 @@ #include "orte/mca/oob/tcp/oob_tcp.h" +typedef struct { + opal_event_t *ev; + mca_oob_tcp_peer_t *peer; + mca_oob_tcp_msg_t *msg; +} orte_self_send_xfer_t; + +static void mca_oob_tcp_send_snd_exe(int fd, short args, void* data) +{ + orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data; + mca_oob_tcp_peer_t *peer = 
xfer->peer; + mca_oob_tcp_msg_t *msg = xfer->msg; + + /* release the event for re-use */ + opal_event_free(xfer->ev); + + /* + * Attempt to match against posted receive + */ + mca_oob_tcp_msg_recv_complete(msg, peer); + + free(xfer); +} + static int mca_oob_tcp_send_self( mca_oob_tcp_peer_t* peer, mca_oob_tcp_msg_t* msg, @@ -35,6 +58,7 @@ static int mca_oob_tcp_send_self( unsigned char *ptr; int size = 0; int rc; + orte_self_send_xfer_t *xfer; for(rc = 0; rc < count; rc++) { size += iov[rc].iov_len; @@ -70,10 +94,14 @@ static int mca_oob_tcp_send_self( } opal_mutex_unlock(&msg->msg_lock); - /* - * Attempt to match against posted receive - */ - mca_oob_tcp_msg_recv_complete(msg, peer); + xfer = (orte_self_send_xfer_t*)malloc(sizeof(orte_self_send_xfer_t)); + xfer->ev = opal_event_alloc(); + xfer->peer = peer; + xfer->msg = msg; + opal_event_set(orte_event_base, xfer->ev, -1, OPAL_EV_WRITE, mca_oob_tcp_send_snd_exe, xfer); + opal_event_set_priority(xfer->ev, ORTE_MSG_PRI); + opal_event_active(xfer->ev, OPAL_EV_WRITE, 1); + return size; } diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 4886a0cd5b..07ada275d3 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -67,6 +67,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/state/state.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/base.h" @@ -107,6 +108,7 @@ orte_plm_base_module_t orte_plm_alps_module = { */ static pid_t alps_pid = 0; static bool failed_launch; +static void launch_daemons(int fd, short args, void *cbdata); /** @@ -118,6 +120,7 @@ static int plm_alps_init(void) if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); + return rc; } if (orte_do_not_launch) { @@ -133,6 +136,13 @@ static int plm_alps_init(void) orte_plm_globals.daemon_nodes_assigned_at_launch = false; } + /* point to our launch command */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, + launch_daemons, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return rc; } @@ -142,6 +152,18 @@ static int plm_alps_init(void) * the job can cleanly terminate */ static int plm_alps_launch_job(orte_job_t *jdata) +{ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + /* this is a restart situation - skip to the mapping stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } else { + /* new job - set it up */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT); + } + return ORTE_SUCCESS; +} + +static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map; char *jobid_string = NULL; @@ -158,60 +180,44 @@ static int plm_alps_launch_job(orte_job_t *jdata) char **custom_strings; int num_args, i; char *cur_prefix; - struct timeval joblaunchstart, launchstart, launchstop; int proc_vpid_index; orte_app_context_t *app; orte_node_t *node; orte_std_cntr_t nnode; - orte_jobid_t failed_job; - orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED; orte_job_t *daemons; + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; /* if we are launching debugger daemons, then just go * do it - no new daemons 
will be launched */ - if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) { - failed_job = jdata->jobid; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - ORTE_ERROR_LOG(rc); - failed_launch = true; - } else { - failed_launch = false; - } - goto cleanup; + if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } - /* default to declaring the daemon launch failed */ - failed_job = ORTE_PROC_MY_NAME->jobid; - - if (mca_plm_alps_component.timing) { - if (0 != gettimeofday(&joblaunchstart, NULL)) { - opal_output(0, "plm_alps: could not obtain job start time"); - } - } - - /* indicate the state of the launch */ - failed_launch = true; - /* start by setting up the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { + if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* if we don't want to launch, then don't attempt to + /* if we don't want to launch, then don't attempt to * launch the daemons - the user really wants to just * look at the proposed process map */ if (orte_do_not_launch) { - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:alps: launching vm", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -220,11 +226,17 @@ static int plm_alps_launch_job(orte_job_t *jdata) } if (0 == map->num_new_daemons) { - /* have all 
the daemons we need - launch app */ + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:alps: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto launch_apps; + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } /* need integer value for command line parameter */ @@ -344,9 +356,9 @@ static int plm_alps_launch_job(orte_job_t *jdata) don't support different --prefix'es for different nodes in the ALPS plm) */ cur_prefix = NULL; - for (i=0; i < jdata->apps->size; i++) { + for (i=0; i < state->jdata->apps->size; i++) { char *app_prefix_dir; - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) { continue; } app_prefix_dir = app->prefix_dir; @@ -357,7 +369,7 @@ static int plm_alps_launch_job(orte_job_t *jdata) 0 != strcmp (cur_prefix, app_prefix_dir)) { orte_show_help("help-plm-alps.txt", "multiple-prefixes", true, cur_prefix, app_prefix_dir); - return ORTE_ERR_FATAL; + goto cleanup; } /* If not yet set, copy it; iff set, then it's the @@ -375,66 +387,26 @@ static int plm_alps_launch_job(orte_job_t *jdata) /* setup environment */ env = opal_argv_copy(orte_launch_environ); - if (mca_plm_alps_component.timing) { - if (0 != gettimeofday(&launchstart, NULL)) { - opal_output(0, "plm_alps: could not obtain start time"); - } + if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { + param = opal_argv_join(argv, ' '); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:alps: final top-level argv:\n\t%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == param) ? 
"NULL" : param)); + if (NULL != param) free(param); } - - /* set the job state to indicate we attempted to launch */ - job_state = ORTE_JOB_STATE_FAILED_TO_START; - + /* exec the daemon(s) */ if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* do NOT wait for alps to complete. Alps only completes when the processes - * it starts - in this case, the orteds - complete. Instead, we'll catch - * any alps failures and deal with them elsewhere - */ - - /* wait for daemons to callback */ - if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - launch_apps: - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - failed_job = jdata->jobid; - goto cleanup; - } - failed_job = jdata->jobid; + /* indicate that the daemons for this job were launched */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* declare the launch a success */ + /* flag that launch was successful, so far as we currently know */ failed_launch = false; - - if (mca_plm_alps_component.timing) { - if (0 != gettimeofday(&launchstop, NULL)) { - opal_output(0, "plm_alps: could not obtain stop time"); - } else { - opal_output(0, "plm_alps: daemon block launch time is %ld usec", - (launchstop.tv_sec - launchstart.tv_sec)*1000000 + - (launchstop.tv_usec - launchstart.tv_usec)); - opal_output(0, "plm_alps: total job launch time is %ld usec", - (launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + - (launchstop.tv_usec - joblaunchstart.tv_usec)); - } - } - - if (ORTE_SUCCESS != rc) { - opal_output(0, "plm:alps: start_procs returned error %d", rc); - goto cleanup; - } cleanup: if (NULL != argv) { @@ -448,20 +420,13 @@ static int plm_alps_launch_job(orte_job_t *jdata) free(jobid_string); } + /* cleanup 
the caddy */ + OBJ_RELEASE(state); + /* check for failed launch - if so, force terminate */ if (failed_launch) { - if (ORTE_ERR_SILENT == rc) { - orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } else { - orte_errmgr.update_state(failed_job, job_state, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } - - return rc; } @@ -528,6 +493,8 @@ static int plm_alps_finalize(void) static void alps_wait_cb(pid_t pid, int status, void* cbdata){ + orte_job_t *jdata; + /* According to the ALPS folks, alps always returns the highest exit code of our remote processes. Thus, a non-zero exit status doesn't necessarily mean that alps failed - it could be that an orted returned @@ -543,6 +510,7 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){ alps failed. Report the error and make sure that orterun wakes up - otherwise, do nothing! */ + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (0 != status) { if (failed_launch) { @@ -556,15 +524,12 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){ /* report that the daemon has failed so we break out of the daemon * callback receive and exit */ - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, 0, status); - + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); } else { /* an orted must have died unexpectedly after launch - report * that the daemon has failed so we exit */ - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED, - NULL, ORTE_PROC_STATE_UNDEF, 0, status); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED); } } diff --git a/orte/mca/plm/base/base.h b/orte/mca/plm/base/base.h index 8589ec00fa..6dfc351055 100644 --- a/orte/mca/plm/base/base.h +++ b/orte/mca/plm/base/base.h @@ -82,6 +82,14 @@ ORTE_DECLSPEC int 
orte_plm_base_close(void); ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data); ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data); +ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_daemons_reported(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_daemons_launched(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_launch_apps(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_post_launch(int fd, short args, void *cbdata); +ORTE_DECLSPEC void orte_plm_base_registered(int fd, short args, void *cbdata); + #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/mca/plm/base/plm_base_close.c b/orte/mca/plm/base/plm_base_close.c index 68337c5043..9734a6d57e 100644 --- a/orte/mca/plm/base/plm_base_close.c +++ b/orte/mca/plm/base/plm_base_close.c @@ -59,14 +59,6 @@ int orte_plm_base_close(void) orte_plm.finalize(); } - /* clearout the orted cmd locks */ - OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_lock); - OBJ_DESTRUCT(&orte_plm_globals.orted_cmd_cond); - - /* clearout the spawn locks */ - OBJ_DESTRUCT(&orte_plm_globals.spawn_lock); - OBJ_DESTRUCT(&orte_plm_globals.spawn_cond); - /* Close all open components */ mca_base_components_close(orte_plm_globals.output, &orte_plm_base.available_components, NULL); diff --git a/orte/mca/plm/base/plm_base_jobid.c b/orte/mca/plm/base/plm_base_jobid.c index f688676c53..a4961dcfff 100644 --- a/orte/mca/plm/base/plm_base_jobid.c +++ b/orte/mca/plm/base/plm_base_jobid.c @@ -62,12 +62,10 @@ int orte_plm_base_set_hnp_name(void) /* set the name */ ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); ORTE_PROC_MY_NAME->vpid = 0; - ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN); /* copy it to the HNP field */ ORTE_PROC_MY_HNP->jobid = 
ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_PROC_MY_NAME->epoch); /* done */ return ORTE_SUCCESS; @@ -99,7 +97,7 @@ int orte_plm_base_create_jobid(orte_job_t *jdata) } #endif - if (ORTE_JOB_STATE_RESTART == jdata->state) { + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { /* this job is being restarted - do not assign it * a new jobid */ diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 0aeced0c76..884078dca2 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -12,8 +12,7 @@ * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,11 +39,12 @@ #include "opal/mca/base/mca_base_param.h" #include "opal/mca/hwloc/hwloc.h" +#include "orte/util/session_dir.h" #include "orte/util/show_help.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" #include "orte/mca/iof/iof.h" -#include "orte/mca/ras/ras.h" +#include "orte/mca/ras/base/base.h" #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/base/base.h" #include "orte/mca/rml/rml.h" @@ -57,7 +57,10 @@ #endif #include "orte/mca/filem/filem.h" #include "orte/mca/filem/base/base.h" +#include "orte/mca/grpcomm/base/base.h" +#include "orte/mca/notifier/notifier.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/sensor/sensor.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_locks.h" @@ -66,106 +69,194 @@ #include "orte/util/nidmap.h" #include "orte/util/proc_info.h" #include "orte/util/regex.h" +#include "orte/mca/state/state.h" 
+#include "orte/mca/state/base/base.h" #include "orte/util/hostfile/hostfile.h" - #include "orte/mca/odls/odls_types.h" #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/base/base.h" -int orte_plm_base_setup_job(orte_job_t *jdata) +void orte_plm_base_daemons_reported(int fd, short args, void *cbdata) { - orte_app_context_t *app; - int rc; - int32_t ljob; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; int i; + orte_job_t *jdata; + #if OPAL_HAVE_HWLOC - orte_node_t *node; - hwloc_topology_t t0; + { + hwloc_topology_t t; + orte_node_t *node; + int i; + + /* if the user didn't indicate that the node topologies were + * different, then set the nodes to point to the topology + * of the first node. + * + * NOTE: We do -not- point the nodes at the topology of + * mpirun because many "homogeneous" clusters have a head + * node that differs from all the compute nodes! + */ + if (!orte_hetero_nodes) { + if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, 1))) { + /* all collapsed down into mpirun's topology */ + t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, 0); + } + for (i=1; i < orte_node_pool->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + node->topology = t; + } + } + } #endif - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:setup_job for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - - /* if the job is not being restarted and hasn't already been given a jobid, prep it */ - if (ORTE_JOB_STATE_RESTART != jdata->state && ORTE_JOBID_INVALID == jdata->jobid) { - /* get a jobid for it */ - if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; + /* progress all jobs whose daemons have launched */ + for (i=1; i < orte_job_data->size; i++) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { 
+ continue; + } + if (ORTE_JOB_STATE_DAEMONS_LAUNCHED == jdata->state) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } - - /* store it on the global job data pool */ - ljob = ORTE_LOCAL_JOBID(jdata->jobid); - opal_pointer_array_set_item(orte_job_data, ljob, jdata); } - /* set the job state */ - if (ORTE_JOB_STATE_RESTART != jdata->state) { - jdata->state = ORTE_JOB_STATE_INIT; + /* cleanup */ + OBJ_RELEASE(caddy); +} + +void orte_plm_base_daemons_launched(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* do NOT increment the state - we wait for the + * daemons to report that they have actually + * started before moving to the right state + */ + /* cleanup */ + OBJ_RELEASE(caddy); +} + +void orte_plm_base_setup_job(int fd, short args, void *cbdata) +{ + int rc; + int i; + orte_app_context_t *app; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_grpcomm_coll_id_t modex, bar1, bar2; + char *modx_par, *modx_val; + char *bar1_par, *bar1_val; + char *bar2_par, *bar2_val; + + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:setup_job", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (ORTE_JOB_STATE_INIT != caddy->job_state) { + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } - + /* update job state */ + caddy->jdata->state = caddy->job_state; + + /* start by getting a jobid */ + if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(caddy->jdata))) { + ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + + /* store it on the global job data pool - this is the key + * step required before we launch the daemons. 
It allows + * the orte_rmaps_base_setup_virtual_machine routine to + * search all apps for any hosts to be used by the vm + */ + opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(caddy->jdata->jobid), caddy->jdata); + /* if job recovery is not defined, set it to default */ - if (!jdata->recovery_defined) { + if (!caddy->jdata->recovery_defined) { /* set to system default */ - jdata->enable_recovery = orte_enable_recovery; + caddy->jdata->enable_recovery = orte_enable_recovery; } + + /* get collective ids for the std MPI operations */ + modex = orte_grpcomm_base_get_coll_id(); + modx_par = mca_base_param_environ_variable("orte", NULL, "peer_modex_id"); + asprintf(&modx_val, "%d", modex); + bar1 = orte_grpcomm_base_get_coll_id(); + bar1_par = mca_base_param_environ_variable("orte", NULL, "peer_init_barrier_id"); + asprintf(&bar1_val, "%d", bar1); + bar2 = orte_grpcomm_base_get_coll_id(); + bar2_par = mca_base_param_environ_variable("orte", NULL, "peer_fini_barrier_id"); + asprintf(&bar2_val, "%d", bar2); + /* if app recovery is not defined, set apps to defaults */ - for (i=0; i < jdata->apps->size; i++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + for (i=0; i < caddy->jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) { continue; } if (!app->recovery_defined) { app->max_restarts = orte_max_restarts; } + /* set the envars for the collective ids */ + opal_setenv(modx_par, modx_val, true, &app->env); + opal_setenv(bar1_par, bar1_val, true, &app->env); + opal_setenv(bar2_par, bar2_val, true, &app->env); } - - /* get the allocation for this job */ - if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } + free(modx_par); + free(modx_val); + free(bar1_par); + free(bar1_val); + free(bar2_par); + free(bar2_val); -#if OPAL_HAVE_HWLOC - /* if we are not going to launch, then we need to set any - * undefined 
topologies to match our own so the mapper - * can operate - */ - if (orte_do_not_launch) { - node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - t0 = node->topology; - for (i=1; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - if (NULL == node->topology) { - node->topology = t0; - } - } - } -#endif + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE); - /* map the job */ - if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } + /* cleanup */ + OBJ_RELEASE(caddy); +} - /* if we don't want to launch, now is the time to leave */ +void orte_plm_base_complete_setup(int fd, short args, void *cbdata) +{ + orte_job_t *jdata, *jdatorted; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* if we don't want to launch the apps, now is the time to leave */ if (orte_do_not_launch) { orte_never_launched = true; - ORTE_UPDATE_EXIT_STATUS(0); - orte_jobs_complete(); - return ORTE_ERR_SILENT; + ORTE_TERMINATE(0); + OBJ_RELEASE(caddy); + return; } - + + /* bozo check */ + if (ORTE_JOB_STATE_SYSTEM_PREP != caddy->job_state) { + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + /* update job state */ + caddy->jdata->state = caddy->job_state; + + /* get the orted job data object */ + if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + + /* convenience */ + jdata = caddy->jdata; + /* quick sanity check - is the stdin target within range * of the job? 
*/ - if (jdata->jobid != ORTE_PROC_MY_NAME->jobid && - ORTE_VPID_WILDCARD != jdata->stdin_target && + if (ORTE_VPID_WILDCARD != jdata->stdin_target && ORTE_VPID_INVALID != jdata->stdin_target && jdata->num_procs <= jdata->stdin_target) { /* this request cannot be met */ @@ -173,10 +264,19 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ORTE_VPID_PRINT(jdata->stdin_target), ORTE_VPID_PRINT(jdata->num_procs)); orte_never_launched = true; - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_jobs_complete(); - return ORTE_ERROR; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } + + orte_process_info.num_procs = jdatorted->num_procs; + + if (orte_process_info.max_procs < orte_process_info.num_procs) { + orte_process_info.max_procs = orte_process_info.num_procs; + } + + /* ensure our routing plan is up-to-date */ + orte_routed.update_routing_plan(); /*** RHC: USER REQUEST TO TIE-OFF STDXXX TO /DEV/NULL *** WILL BE SENT IN LAUNCH MESSAGE AS PART OF CONTROLS FIELD. 
@@ -196,63 +296,48 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ORTE_ERROR_LOG(rc); } #endif - - return ORTE_SUCCESS; -} + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_APPS); -static struct timeval app_launch_start, app_launch_stop; -static opal_event_t *dmn_report_ev=NULL; -bool app_launch_failed; + /* cleanup */ + OBJ_RELEASE(caddy); +} /* catch timeout to allow cmds to progress */ static void timer_cb(int fd, short event, void *cbdata) { + orte_timer_t *tm = (orte_timer_t*)cbdata; + /* free event */ - if (NULL != dmn_report_ev) { - free(dmn_report_ev); - dmn_report_ev = NULL; - } - /* declare time is up */ - app_launch_failed = true; + OBJ_RELEASE(tm); + + /* declare launch failed */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_START); } -int orte_plm_base_launch_apps(orte_jobid_t job) +void orte_plm_base_launch_apps(int fd, short args, void *cbdata) { orte_job_t *jdata; orte_daemon_cmd_flag_t command; opal_buffer_t *buffer; int rc; - orte_process_name_t name = {ORTE_JOBID_INVALID, 0}; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - /* if we are launching the daemon job, then we are - * starting a virtual machine and there is no app - * to launch. 
Just flag the launch as complete - */ - if (ORTE_PROC_MY_NAME->jobid == job) { - rc = ORTE_SUCCESS; - goto WAKEUP; + /* convenience */ + jdata = caddy->jdata; + + if (ORTE_JOB_STATE_LAUNCH_APPS != caddy->job_state) { + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } - + /* update job state */ + caddy->jdata->state = caddy->job_state; + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:launch_apps for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - if (orte_timing) { - gettimeofday(&app_launch_start, NULL); - } - - if (ORTE_JOBID_INVALID == job) { - /* we are only launching debugger daemons */ - jdata = orte_debugger_daemon; - } else { - if (NULL == (jdata = orte_get_job_data_object(job))) { - /* bad jobid */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - rc = ORTE_ERR_BAD_PARAM; - goto WAKEUP; - } - } + ORTE_JOBID_PRINT(jdata->jobid))); /* setup the buffer */ buffer = OBJ_NEW(opal_buffer_t); @@ -262,18 +347,17 @@ int orte_plm_base_launch_apps(orte_jobid_t job) if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); - goto WAKEUP; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* get the local launcher's required data */ - if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(buffer, job))) { + if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(buffer, jdata->jobid))) { ORTE_ERROR_LOG(rc); - goto WAKEUP; - } - - /* if we are timing, record the time we send this message */ - if (orte_timing) { - gettimeofday(&jdata->launch_msg_sent, NULL); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* send the command to the daemons */ @@ -281,96 +365,156 @@ int orte_plm_base_launch_apps(orte_jobid_t job) buffer, ORTE_RML_TAG_DAEMON))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); - goto WAKEUP; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } - 
OBJ_RELEASE(buffer); - + /* setup a timer - if we don't launch within the * defined time, then we know things have failed */ if (0 < orte_startup_timeout) { - ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb); + ORTE_DETECT_TIMEOUT(orte_startup_timeout, 1000, 10000000, timer_cb, NULL); } - - /* wait for all the daemons to report apps launched */ - app_launch_failed = false; - ORTE_PROGRESSED_WAIT(app_launch_failed, jdata->num_launched, jdata->num_procs); - - if (ORTE_JOB_STATE_RUNNING != jdata->state) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:launch failed for job %s on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), ORTE_ERROR_NAME(rc))); - goto WAKEUP; + + /* cleanup */ + OBJ_RELEASE(caddy); +} + +void orte_plm_base_post_launch(int fd, short args, void *cbdata) +{ + int32_t rc; + orte_job_t *jdata; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_process_name_t name; + + /* convenience */ + jdata = caddy->jdata; + + if (ORTE_JOB_STATE_RUNNING != caddy->job_state) { + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } - - if (orte_timing) { - int64_t maxsec, maxusec; - char *tmpstr; - gettimeofday(&app_launch_stop, NULL); - /* subtract starting time to get time in microsecs for test */ - maxsec = app_launch_stop.tv_sec - app_launch_start.tv_sec; - maxusec = app_launch_stop.tv_usec - app_launch_start.tv_usec; - tmpstr = orte_pretty_print_timing(maxsec, maxusec); - fprintf(orte_timing_output, "Time to launch apps: %s\n", tmpstr); - free(tmpstr); - } - + /* update job state */ + caddy->jdata->state = caddy->job_state; + /* complete wiring up the iof */ OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:launch wiring up iof", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + "%s plm:base:launch wiring up iof for job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); /* push stdin - the IOF will know what 
to do with the specified target */ - name.jobid = job; + name.jobid = jdata->jobid; name.vpid = jdata->stdin_target; - ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) { ORTE_ERROR_LOG(rc); - goto WAKEUP; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:launch completed for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - -WAKEUP: - /* wakeup anyone waiting for this */ - orte_plm_globals.spawn_complete = true; - orte_plm_globals.spawn_status = rc; - opal_condition_broadcast(&orte_plm_globals.spawn_cond); - return rc; + /* complete debugger interface */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); + + /* cleanup */ + OBJ_RELEASE(caddy); +} + +void orte_plm_base_registered(int fd, short args, void *cbdata) +{ + int ret; + int32_t rc; + orte_job_t *jdata; + opal_buffer_t *answer; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* convenience */ + jdata = caddy->jdata; + + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:launch registered event", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:launch job %s not registered - state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(caddy->job_state))); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + /* update job state */ + caddy->jdata->state = caddy->job_state; + + /* if this isn't a dynamic spawn, just cleanup */ + if (ORTE_JOBID_INVALID == jdata->originator.jobid) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:launch job %s is not a dynamic spawn", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + goto cleanup; + } 
+ /* if it was a dynamic spawn, send the response */ + rc = ORTE_SUCCESS; + answer = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(ret); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(ret); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:launch sending dyn release of job %s to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + ORTE_NAME_PRINT(&jdata->originator))); + if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer, + ORTE_RML_TAG_PLM_PROXY, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + + cleanup: + /* RHC: need to init_after_spawn for debuggers */ + /* no state to activate - this ends the launch sequence */ + OBJ_RELEASE(caddy); } /* daemons callback when they start - need to listen for them */ -static int orted_num_callback; static bool orted_failed_launch; -static orte_job_t *jdatorted; -static struct timeval daemonlaunchtime = {0,0}, daemonsetuptime = {0,0}, daemoncbtime = {0,0}; +static orte_job_t *jdatorted=NULL; -static void process_orted_launch_report(int fd, short event, void *data) +void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)data; - opal_buffer_t *buffer = mev->buffer; orte_process_name_t peer; char *rml_uri = NULL, *ptr; int rc, idx; - struct timeval recvtime; - long secs, usecs; - int64_t setupsec, setupusec; - int64_t startsec, startusec; orte_proc_t *daemon=NULL; char *nodename; orte_node_t *node; - /* see if we need to timestamp 
this receipt */ - if (orte_timing) { - gettimeofday(&recvtime, NULL); + /* get the daemon job, if necessary */ + if (NULL == jdatorted) { + jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } - + /* unpack its contact info */ idx = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) { @@ -386,21 +530,13 @@ static void process_orted_launch_report(int fd, short event, void *data) goto CLEANUP; } - rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL ); - if( ORTE_SUCCESS != rc ) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch from daemon %s via %s", + "%s plm:base:orted_report_launch from daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer), - ORTE_NAME_PRINT(&mev->sender))); + ORTE_NAME_PRINT(sender))); /* update state and record for this daemon contact info */ - if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, peer.vpid))) { + if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, sender->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); orted_failed_launch = true; goto CLEANUP; @@ -408,74 +544,6 @@ static void process_orted_launch_report(int fd, short event, void *data) daemon->state = ORTE_PROC_STATE_RUNNING; daemon->rml_uri = rml_uri; - /* if we are doing a timing test, unload the start and setup times of the daemon */ - if (orte_timing) { - /* get the time stamp when the daemon first started */ - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startsec, &idx, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startusec, &idx, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - /* save the latest daemon to start */ - if (startsec > daemonlaunchtime.tv_sec) { - daemonlaunchtime.tv_sec 
= startsec; - daemonlaunchtime.tv_usec = startusec; - } else if (startsec == daemonlaunchtime.tv_sec && - startusec > daemonlaunchtime.tv_usec) { - daemonlaunchtime.tv_usec = startusec; - } - /* get the time required for the daemon to setup - locally computed by each daemon */ - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - /* save the longest */ - if (setupsec > daemonsetuptime.tv_sec) { - daemonsetuptime.tv_sec = setupsec; - daemonsetuptime.tv_usec = setupusec; - } else if (setupsec == daemonsetuptime.tv_sec && - setupusec > daemonsetuptime.tv_usec) { - daemonsetuptime.tv_usec = setupusec; - } - /* get the time stamp of when the daemon started to send this message to us */ - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - idx = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - goto CLEANUP; - } - /* check the time for the callback to complete and save the longest */ - ORTE_COMPUTE_TIME_DIFF(secs, usecs, setupsec, setupusec, recvtime.tv_sec, recvtime.tv_usec); - if (secs > daemoncbtime.tv_sec) { - daemoncbtime.tv_sec = secs; - daemoncbtime.tv_usec = usecs; - } else if (secs == daemoncbtime.tv_sec && - usecs > daemoncbtime.tv_usec) { - daemoncbtime.tv_usec = usecs; - } - } - /* unpack the node name */ idx = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) { @@ -599,153 +667,32 @@ static void process_orted_launch_report(int fd, short event, void *data) } #endif - /* if a tree-launch is underway, send the cmd back */ - if (NULL != 
orte_tree_launch_cmd) { - orte_rml.send_buffer(&peer, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0); - } - CLEANUP: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_report_launch %s for daemon %s (via %s) at contact %s", + "%s plm:base:orted_report_launch %s for daemon %s at contact %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orted_failed_launch ? "failed" : "completed", - ORTE_NAME_PRINT(&peer), - ORTE_NAME_PRINT(&mev->sender), + ORTE_NAME_PRINT(sender), (NULL == daemon) ? "UNKNOWN" : daemon->rml_uri)); - /* release the message */ - OBJ_RELEASE(mev); - if (orted_failed_launch) { - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, - ORTE_JOB_STATE_SILENT_ABORT, - NULL, ORTE_PROC_STATE_FAILED_TO_START, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START); } else { - orted_num_callback++; - } -} - -static void orted_report_launch(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - int rc; - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release when processed - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_orted_launch_report); - - /* reissue the recv */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK, - ORTE_RML_NON_PERSISTENT, orted_report_launch, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - orted_failed_launch = true; - } -} - - -int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:daemon_callback", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - orted_num_callback = 0; - orted_failed_launch = false; - /* get the orted job data object */ - if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK, - ORTE_RML_NON_PERSISTENT, orted_report_launch, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - ORTE_PROGRESSED_WAIT(orted_failed_launch, orted_num_callback, num_daemons); - - /* cancel the lingering recv */ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK); - - if (orted_failed_launch) { - /* we will have already emitted an error log or show - * help, so exit quietly from here - */ - return ORTE_ERR_SILENT; - } - -#if OPAL_HAVE_HWLOC - { - hwloc_topology_t t; - orte_node_t *node; - int i; - - /* if the user didn't indicate that the node topologies were - * different, then set the nodes to point to the topology - * of the first node. 
- * - * NOTE: We do -not- point the nodes at the topology of - * mpirun because many "homogeneous" clusters have a head - * node that differs from all the compute nodes! - */ - if (!orte_hetero_nodes) { - if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, 1))) { - /* all collapsed down into mpirun's topology */ - t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, 0); - } - for (i=1; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - node->topology = t; - } + jdatorted->num_reported++; + if (jdatorted->num_procs == jdatorted->num_reported) { + /* activate the daemons_reported state */ + ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_DAEMONS_REPORTED); } } -#endif - /* if we are timing, output the results */ - if (orte_timing) { - int64_t sec, usec; - char *tmpstr; - ORTE_COMPUTE_TIME_DIFF(sec, usec, orte_plm_globals.daemonlaunchstart.tv_sec, - orte_plm_globals.daemonlaunchstart.tv_usec, - daemonlaunchtime.tv_sec, daemonlaunchtime.tv_usec); - tmpstr = orte_pretty_print_timing(sec, usec); - fprintf(orte_timing_output, "Daemon launch was completed in %s\n", tmpstr); - free(tmpstr); - tmpstr = orte_pretty_print_timing(daemonsetuptime.tv_sec, daemonsetuptime.tv_usec); - fprintf(orte_timing_output, "Daemon setup (from first exec statement to ready-for-commands) was completed in a maximum of %s\n", tmpstr); - free(tmpstr); - tmpstr = orte_pretty_print_timing(daemoncbtime.tv_sec, daemoncbtime.tv_usec); - fprintf(orte_timing_output, "Daemon callback message to HNP took a maximum time of %s to reach the HNP\n", tmpstr); - free(tmpstr); - } - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:daemon_callback completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if a tree-launch was underway, clear out the cmd */ + /* if a tree-launch is underway, send the cmd back */ if (NULL != orte_tree_launch_cmd) { - 
OBJ_RELEASE(orte_tree_launch_cmd); + OBJ_RETAIN(orte_tree_launch_cmd); + orte_rml.send_buffer_nb(sender, orte_tree_launch_cmd, + ORTE_RML_TAG_DAEMON, 0, + orte_rml_send_callback, NULL); } - - return ORTE_SUCCESS; + } int orte_plm_base_setup_orted_cmd(int *argc, char ***argv) @@ -1051,15 +998,6 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) */ map->num_new_daemons = 0; - /* run the allocator on the application job - this allows us to - * pickup any host or hostfile arguments so we get the full - * array of nodes in our allocation - */ - if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* construct a list of available nodes - don't need ours as * we already exist */ @@ -1191,8 +1129,6 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) return ORTE_ERR_OUT_OF_RESOURCE; } proc->name.vpid = daemons->num_procs; /* take the next available vpid */ - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:setup_vm add new daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -1239,10 +1175,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) orte_process_info.max_procs = orte_process_info.num_procs; } - if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(rc); - return rc; - } + /* ensure our routing plan is up-to-date */ + orte_routed.update_routing_plan(); } return ORTE_SUCCESS; diff --git a/orte/mca/plm/base/plm_base_open.c b/orte/mca/plm/base/plm_base_open.c index f14b5683d3..9fb37ffe2b 100644 --- a/orte/mca/plm/base/plm_base_open.c +++ b/orte/mca/plm/base/plm_base_open.c @@ -98,17 +98,6 @@ int orte_plm_base_open(void) /* init selected to be false */ orte_plm_base.selected = false; - /* initialize the condition variables for orted comm */ - OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_lock, opal_mutex_t); - 
OBJ_CONSTRUCT(&orte_plm_globals.orted_cmd_cond, opal_condition_t); - - /* initialize the condition variables for spawn */ - OBJ_CONSTRUCT(&orte_plm_globals.spawn_lock, opal_mutex_t); - OBJ_CONSTRUCT(&orte_plm_globals.spawn_cond, opal_condition_t); - OBJ_CONSTRUCT(&orte_plm_globals.spawn_in_progress_cond, opal_condition_t); - orte_plm_globals.spawn_complete = false; - orte_plm_globals.spawn_in_progress = false; - /* init the next jobid */ orte_plm_globals.next_jobid = 1; diff --git a/orte/mca/plm/base/plm_base_orted_cmds.c b/orte/mca/plm/base/plm_base_orted_cmds.c index fffb5e38df..830c20806a 100644 --- a/orte/mca/plm/base/plm_base_orted_cmds.c +++ b/orte/mca/plm/base/plm_base_orted_cmds.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,189 +40,67 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" #include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" #include "orte/util/name_fns.h" #include "orte/util/proc_info.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_wait.h" #include "orte/orted/orted.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/plm_private.h" -static opal_event_t *ev=NULL; -static orte_vpid_t num_reported, num_being_sent; -static bool done_reporting; - -static void failed_send(int fd, short event, void *arg) +#if 0 +static void failed_cmd(int fd, short event, void *cbdata) { - /* we get called if the sends in an abnormal term - * don't get sent in time - set the done flag - * so we can return the error + orte_timer_t *tm = (orte_timer_t*)cbdata; + + /* we get called if an abnormal term + * don't complete in time - just force exit */ OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd command messages timed 
out with num_sent %ld", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)num_reported)); - done_reporting = true; -} - -static void send_callback(int status, - orte_process_name_t* peer, - opal_buffer_t* req, - orte_rml_tag_t tag, - void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd message to %s sent", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer))); - - num_reported++; - if (num_reported == num_being_sent) { - /* cancel the timer */ - if (NULL != ev) { - opal_event_evtimer_del(ev); - free(ev); - ev = NULL; - } - - /* mark as done */ - done_reporting = true; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd all messages sent", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - } + "%s plm:base:orted_cmd command timed out", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + OBJ_RELEASE(tm); +/* + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); +*/ } +#endif int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command) { int rc; - opal_buffer_t cmd; - orte_job_t *daemons; - orte_proc_t *proc; + opal_buffer_t *cmd; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:orted_cmd sending orted_exit commands", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - OBJ_CONSTRUCT(&cmd, opal_buffer_t); - /* flag that orteds are being terminated */ orte_orteds_term_ordered = true; - /* get the job object for the daemons */ - if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* pack the command */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { + /* we are not abnormally terminating - send it express delivery! 
*/ + cmd = OBJ_NEW(opal_buffer_t); + /* pack the command */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&cmd); + OBJ_RELEASE(cmd); return rc; } - - /* if we are abnormally ordering the termination, then - * we do -not- want to use a collective operation to send the - * command out as some of the daemons may not be alive and thus - * any daemon beyond that in the collective wouldn't get the - * command - use an alternative approach - */ - if (orte_abnormal_term_ordered) { - orte_vpid_t v; - orte_process_name_t peer; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd:orted_exit abnormal term ordered", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* turn off message routing - no way to guarantee that - * the route still exists - */ - orte_routing_is_enabled = false; - - /* now send the command one daemon at a time using a non-blocking - * send - let the callback function keep track of how many - * complete - it will delete the event if they all do. 
- * Start with vpid=1 as the HNP is told to exit another way - */ - done_reporting = false; - num_reported = 0; - num_being_sent = daemons->num_procs-1; - peer.jobid = ORTE_PROC_MY_NAME->jobid; - for(v=1; v < daemons->num_procs; v++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { - continue; - } - /* if we don't have contact info for this daemon, - * then we know we can't reach it - so don't try - */ - if (NULL == proc->rml_uri || proc->state > ORTE_PROC_STATE_UNTERMINATED) { - --num_being_sent; - /* maintain accounting so orterun will exit */ - daemons->num_terminated++; - continue; - } - peer.vpid = v; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - - /* don't worry about errors on the send here - just - * issue it and keep going - */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd:orted_exit sending cmd to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer_nb(&peer, &cmd, ORTE_RML_TAG_DAEMON, 0, - send_callback, 0))) { - ORTE_ERROR_LOG(rc); - --num_being_sent; - /* maintain accounting so orterun will exit */ - daemons->num_terminated++; - } - } - - /* since we cannot know which daemons may/may not be alive, - * setup an event so we will time out after giving the send - * our best attempt - */ - ORTE_DETECT_TIMEOUT(&ev, num_being_sent, - orte_timeout_usec_per_proc, - orte_max_timeout, failed_send); - - /* wait for completion or timeout */ - ORTE_PROGRESSED_WAIT(done_reporting, num_reported, num_being_sent); - - /* cleanup the timer */ - if (NULL != ev) { - opal_event_del(ev); - free(ev); - ev = NULL; - } - - /* be sure I get the command */ - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmd, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - - /* if all the sends didn't go, or we couldn't send to - * all daemons, then report that */ - if (num_reported < num_being_sent || - num_being_sent < (daemons->num_procs-1)) { - 
OBJ_DESTRUCT(&cmd); - return ORTE_ERR_SILENT; - } - - /* if all sends went out, return success */ - OBJ_DESTRUCT(&cmd); - return ORTE_SUCCESS; - } - - /* we are not abnormally terminating - send it express delivery! */ - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, &cmd, ORTE_RML_TAG_DAEMON))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, cmd, ORTE_RML_TAG_DAEMON))) { ORTE_ERROR_LOG(rc); } - OBJ_DESTRUCT(&cmd); + OBJ_RELEASE(cmd); +#if 0 + /* if we are abnormally ordering the termination, then + * set a timeout in case it never finishes + */ + if (orte_abnormal_term_ordered) { + ORTE_DETECT_TIMEOUT(orte_process_info.num_procs, 100, 3, failed_cmd, NULL); + } +#endif return rc; } @@ -241,7 +121,6 @@ int orte_plm_base_orted_terminate_job(orte_jobid_t jobid) OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = jobid; proc.name.vpid = ORTE_VPID_WILDCARD; - ORTE_EPOCH_SET(proc.name.epoch,ORTE_EPOCH_WILDCARD); opal_pointer_array_add(&procs, &proc); if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) { ORTE_ERROR_LOG(rc); @@ -254,23 +133,20 @@ int orte_plm_base_orted_terminate_job(orte_jobid_t jobid) int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs) { int rc; - opal_buffer_t cmd; + opal_buffer_t *cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS; int v; - orte_process_name_t peer; - orte_job_t *daemons; orte_proc_t *proc; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:orted_cmd sending kill_local_procs cmds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - OBJ_CONSTRUCT(&cmd, opal_buffer_t); - + cmd = OBJ_NEW(opal_buffer_t); /* pack the command */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&cmd); + OBJ_RELEASE(cmd); return rc; } @@ -280,125 +156,17 @@ int 
orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs) if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procs, v))) { continue; } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &(proc->name), 1, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &(proc->name), 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&cmd); + OBJ_RELEASE(cmd); return rc; } } } - - /* if we are abnormally ordering the termination, then - * we do -not- want to use a collective operation to send the - * command out as some of the daemons may not be alive and thus - * any daemon beyond that in the collective wouldn't get the - * command - use an alternative approach - */ - if (orte_abnormal_term_ordered) { - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd:kill_local_procs abnormal term ordered", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* get the job object for the daemons */ - if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* if I am the HNP, I need to get this message too, but just set things - * up so the cmd processor gets called. - * We don't want to message ourselves as this can create circular logic - * in the RML. Instead, this macro will set a zero-time event which will - * cause the buffer to be processed by the cmd processor - probably will - * fire right away, but that's okay - * The macro makes a copy of the buffer, so it's okay to release it here - */ - if (ORTE_PROC_IS_HNP) { - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmd, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor); - } - - /* now send the command one daemon at a time using a non-blocking - * send - let the callback function keep track of how many - * complete - it will delete the event if they all do. 
- * Start with vpid=1 as the HNP gets it another way - */ - done_reporting = false; - num_reported = 0; - num_being_sent = daemons->num_procs-1; - peer.jobid = ORTE_PROC_MY_NAME->jobid; - for(v=1; v < daemons->procs->size; v++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { - continue; - } - /* if we don't have contact info for this daemon, - * then we know we can't reach it - so don't try - */ - if (NULL == proc->rml_uri) { - --num_being_sent; - continue; - } - peer.vpid = v; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); - /* check to see if this daemon is known to be "dead" */ - if (proc->state > ORTE_PROC_STATE_UNTERMINATED) { - /* don't try to send this */ - --num_being_sent; - continue; - } - /* don't worry about errors on the send here - just - * issue it and keep going - */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd:kill_local_procs sending cmd to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer))); - if (0 > (rc = orte_rml.send_buffer_nb(&peer, &cmd, ORTE_RML_TAG_DAEMON, 0, - send_callback, 0))) { - ORTE_ERROR_LOG(rc); - --num_being_sent; - } - } - OBJ_DESTRUCT(&cmd); /* done with this */ - - /* since we cannot know which daemons may/may not be alive, - * setup an event so we will time out after giving the send - * our best attempt - */ - ORTE_DETECT_TIMEOUT(&ev, num_being_sent, - orte_timeout_usec_per_proc, - orte_max_timeout, failed_send); - - /* wait for completion or timeout */ - ORTE_PROGRESSED_WAIT(done_reporting, num_reported, num_being_sent); - - /* cleanup the timer */ - if (NULL != ev) { - opal_event_del(ev); - free(ev); - ev = NULL; - } - - /* if all the sends didn't go, or we couldn't send to - * all daemons, then report that */ - if (num_reported < num_being_sent || - num_being_sent < (daemons->num_procs-1)) { - return ORTE_ERR_SILENT; - } - - /* if all sends went out, return success */ - return ORTE_SUCCESS; - } - - /* we are not 
abnormally terminating - send it express delivery! */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:orted_cmd:kill_local_procs term ordered", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, &cmd, ORTE_RML_TAG_DAEMON))) { + if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, cmd, ORTE_RML_TAG_DAEMON))) { ORTE_ERROR_LOG(rc); } - OBJ_DESTRUCT(&cmd); + OBJ_RELEASE(cmd); /* we're done! */ return rc; diff --git a/orte/mca/plm/base/plm_base_proxy.c b/orte/mca/plm/base/plm_base_proxy.c index 19962dfb04..cb1a99e6dc 100644 --- a/orte/mca/plm/base/plm_base_proxy.c +++ b/orte/mca/plm/base/plm_base_proxy.c @@ -39,73 +39,78 @@ int orte_plm_proxy_init(void) int orte_plm_proxy_spawn(orte_job_t *jdata) { - opal_buffer_t buf; + opal_buffer_t *buf; orte_plm_cmd_flag_t command; orte_std_cntr_t count; int rc; + int32_t retval; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:proxy spawn child job", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* setup the buffer */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); + buf = OBJ_NEW(opal_buffer_t); /* tell the recipient we are sending a launch request */ command = ORTE_PLM_LAUNCH_JOB_CMD; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* pack the jdata object */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata, 1, ORTE_JOB))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &jdata, 1, ORTE_JOB))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:proxy sending spawn cmd to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_HNP))); - - /* tell the target to launch the job */ - if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0))) { + /* tell the HNP to launch the job */ + if (0 > (rc = 
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); goto CLEANUP; } - OBJ_DESTRUCT(&buf); OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:proxy waiting for response", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* wait for the target's response */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - if (0 > (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_PLM_PROXY, 0))) { + /* wait for the HNP's response */ + buf = OBJ_NEW(opal_buffer_t); + if (0 > (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, buf, ORTE_RML_TAG_PLM_PROXY, 0))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + goto CLEANUP; + } + + /* get the returned status code for the launch request */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &retval, &count, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); goto CLEANUP; } /* get the new jobid back in case the caller wants it */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(jdata->jobid), &count, ORTE_JOBID))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(jdata->jobid), &count, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); goto CLEANUP; } - if (ORTE_JOBID_INVALID == jdata->jobid) { - /* something went wrong on far end - go no further */ + OBJ_RELEASE(buf); + + if (ORTE_SUCCESS != retval || ORTE_JOBID_INVALID == jdata->jobid) { + /* something went wrong on far end */ rc = ORTE_ERR_FAILED_TO_START; - goto CLEANUP; } - /* good to go! */ - -CLEANUP: - OBJ_DESTRUCT(&buf); - +CLEANUP: return rc; } diff --git a/orte/mca/plm/base/plm_base_receive.c b/orte/mca/plm/base/plm_base_receive.c index a8940e0b09..ae39fec9c7 100644 --- a/orte/mca/plm/base/plm_base_receive.c +++ b/orte/mca/plm/base/plm_base_receive.c @@ -10,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
+ * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,6 +50,7 @@ #include "orte/mca/routed/routed.h" #include "orte/mca/ras/base/base.h" #include "orte/util/name_fns.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_quit.h" @@ -57,14 +60,6 @@ #include "orte/mca/plm/base/base.h" static bool recv_issued=false; -static opal_mutex_t lock; -static opal_condition_t cond; -static opal_list_t recvs; -static opal_event_t ready; -static int ready_fd[2]; -static bool processing; - -static void process_msg(int fd, short event, void *data); int orte_plm_base_comm_start(void) { @@ -78,29 +73,21 @@ int orte_plm_base_comm_start(void) "%s plm:base:receive start comm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - processing = false; - OBJ_CONSTRUCT(&lock, opal_mutex_t); - OBJ_CONSTRUCT(&cond, opal_condition_t); - OBJ_CONSTRUCT(&recvs, opal_list_t); -#ifndef __WINDOWS__ - pipe(ready_fd); -#else - if (create_socketpair(AF_UNIX, SOCK_STREAM, 0, ready_fd) == -1) { - return ORTE_ERROR; - } -#endif - - memset(&ready, 0, sizeof(opal_event_t)); - opal_event_set(opal_event_base, &ready, ready_fd[0], OPAL_EV_READ, process_msg, NULL); - opal_event_add(&ready, 0); - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLM, - ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_plm_base_recv, NULL))) { ORTE_ERROR_LOG(rc); } + if (ORTE_PROC_IS_HNP) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_ORTED_CALLBACK, + ORTE_RML_PERSISTENT, + orte_plm_base_daemon_callback, NULL))) { + ORTE_ERROR_LOG(rc); + } + } recv_issued = true; return rc; @@ -113,21 +100,14 @@ int orte_plm_base_comm_stop(void) return ORTE_SUCCESS; } - OBJ_DESTRUCT(&recvs); - opal_event_del(&ready); -#ifndef __WINDOWS__ - close(ready_fd[0]); -#else - closesocket(ready_fd[0]); -#endif - processing = false; - OBJ_DESTRUCT(&lock); - 
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:receive stop comm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLM); + if (ORTE_PROC_IS_HNP) { + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK); + } recv_issued = false; return ORTE_SUCCESS; @@ -135,392 +115,257 @@ int orte_plm_base_comm_stop(void) /* process incoming messages in order of receipt */ -static void process_msg(int fd, short event, void *data) +void orte_plm_base_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_msg_packet_t *msgpkt; orte_plm_cmd_flag_t command; orte_std_cntr_t count; orte_jobid_t job; orte_job_t *jdata, *parent; - opal_buffer_t answer; + opal_buffer_t *answer; orte_vpid_t vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; -#endif orte_proc_t *proc; orte_proc_state_t state; orte_exit_code_t exit_code; - int rc=ORTE_SUCCESS, ret; + int32_t rc=ORTE_SUCCESS, ret; orte_app_context_t *app, *child_app; - opal_list_item_t *item; - int dump[128]; orte_process_name_t name; pid_t pid; bool running; - - OPAL_ACQUIRE_THREAD(&lock, &cond, &processing); - + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:receive processing msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* clear the file descriptor to stop the event from refiring */ -#ifndef __WINDOWS__ - read(fd, &dump, sizeof(dump)); -#else - recv(fd, (char *) &dump, sizeof(dump), 0); -#endif - - /* reset the event for the next message */ - opal_event_add(&ready, 0); - - while (NULL != (item = opal_list_remove_first(&recvs))) { - msgpkt = (orte_msg_packet_t*)item; - - /* setup a default response */ - OBJ_CONSTRUCT(&answer, opal_buffer_t); - job = ORTE_JOBID_INVALID; + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + switch (command) { + case ORTE_PLM_LAUNCH_JOB_CMD: + OPAL_OUTPUT_VERBOSE((5, 
orte_plm_globals.output, + "%s plm:base:receive job launch command from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + + /* unpack the job object */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &command, &count, ORTE_PLM_CMD))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) { ORTE_ERROR_LOG(rc); - goto CLEANUP; + goto ANSWER_LAUNCH; } - - switch (command) { - case ORTE_PLM_LAUNCH_JOB_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive job launch command", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* unpack the job object */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &jdata, &count, ORTE_JOB))) { - ORTE_ERROR_LOG(rc); - goto ANSWER_LAUNCH; - } - /* flag that this is a dynamic spawn */ - jdata->dyn_spawn_active = true; + /* record the sender so we know who to respond to */ + jdata->originator.jobid = sender->jobid; + jdata->originator.vpid = sender->vpid; - /* get the parent's job object */ - if (NULL == (parent = orte_get_job_data_object(msgpkt->sender.jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - goto ANSWER_LAUNCH; - } + /* get the parent's job object */ + if (NULL == (parent = orte_get_job_data_object(sender->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + goto ANSWER_LAUNCH; + } + /* if the prefix was set in the parent's job, we need to transfer + * that prefix to the child's app_context so any further launch of + * orteds can find the correct binary. There always has to be at + * least one app_context in both parent and child, so we don't + * need to check that here. However, be sure not to overwrite + * the prefix if the user already provided it! 
+ */ + app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0); + child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); + if (NULL != app->prefix_dir && + NULL == child_app->prefix_dir) { + child_app->prefix_dir = strdup(app->prefix_dir); + } - /* if the prefix was set in the parent's job, we need to transfer - * that prefix to the child's app_context so any further launch of - * orteds can find the correct binary. There always has to be at - * least one app_context in both parent and child, so we don't - * need to check that here. However, be sure not to overwrite - * the prefix if the user already provided it! - */ - app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0); - child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); - if (NULL != app->prefix_dir && - NULL == child_app->prefix_dir) { - child_app->prefix_dir = strdup(app->prefix_dir); - } - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive adding hosts", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* process any add-hostfile and add-host options that were provided */ - if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) { - ORTE_ERROR_LOG(rc); - goto ANSWER_LAUNCH; - } - - if( NULL == parent->bookmark ) { - /* find the sender's node in the job map */ - if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, msgpkt->sender.vpid))) { - /* set the bookmark so the child starts from that place - this means - * that the first child process could be co-located with the proc - * that called comm_spawn, assuming slots remain on that node. 
Otherwise, - * the procs will start on the next available node - */ - jdata->bookmark = proc->node; - } - } else { - jdata->bookmark = parent->bookmark; - } - - /* launch it */ - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive calling spawn", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - OPAL_RELEASE_THREAD(&lock, &cond, &processing); - if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { - ORTE_ERROR_LOG(rc); - OPAL_ACQUIRE_THREAD(&lock, &cond, &processing); - goto DEPART; - } - OPAL_ACQUIRE_THREAD(&lock, &cond, &processing); + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:receive adding hosts", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - job = jdata->jobid; - - /* output debugger proctable, if requested */ - if (orte_debugger_dump_proctable && !jdata->map->display_map) { - char *output; - opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP); - if (orte_xml_output) { - fprintf(orte_xml_fp, "%s\n", output); - fflush(orte_xml_fp); - } else { - opal_output(orte_clean_output, "%s", output); - } - free(output); - } + /* process any add-hostfile and add-host options that were provided */ + if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) { + ORTE_ERROR_LOG(rc); + goto ANSWER_LAUNCH; + } - /* return the favor so that any repetitive comm_spawns track each other */ - parent->bookmark = jdata->bookmark; - - /* if the child is an ORTE job, wait for the procs to report they are alive */ - if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive waiting for procs to report", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - OPAL_RELEASE_THREAD(&lock, &cond, &processing); - /* we will wait here until the thread is released, - * indicating that all procs have reported + if( NULL == parent->bookmark ) { + /* find the sender's node in the job map */ + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) { + /* set the bookmark 
so the child starts from that place - this means + * that the first child process could be co-located with the proc + * that called comm_spawn, assuming slots remain on that node. Otherwise, + * the procs will start on the next available node */ - OPAL_ACQUIRE_THREAD(&jdata->dyn_spawn_lock, - &jdata->dyn_spawn_cond, - &jdata->dyn_spawn_active); - OPAL_THREAD_UNLOCK(&jdata->dyn_spawn_lock); - OPAL_ACQUIRE_THREAD(&lock, &cond, &processing); + jdata->bookmark = proc->node; } + } else { + jdata->bookmark = parent->bookmark; + } + + /* launch it */ + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:receive calling spawn", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { + ORTE_ERROR_LOG(rc); + goto ANSWER_LAUNCH; + } + break; + ANSWER_LAUNCH: + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:receive - error on launch: %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc)); + + /* setup the response */ + answer = OBJ_NEW(opal_buffer_t); + + /* pack the error code to be returned */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(ret); + } - ANSWER_LAUNCH: + /* send the response back to the sender */ + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + } + break; + + case ORTE_PLM_UPDATE_PROC_STATE: + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:receive update proc state command from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); + count = 1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive job %s launched", + "%s plm:base:receive got update_proc_state for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job))); - - /* pack the jobid to be returned */ - if (ORTE_SUCCESS != (ret = 
opal_dss.pack(&answer, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(ret); - } - - /* send the response back to the sender */ - if (0 > (ret = orte_rml.send_buffer(&msgpkt->sender, &answer, ORTE_RML_TAG_PLM_PROXY, 0))) { - ORTE_ERROR_LOG(ret); - } - break; - - case ORTE_PLM_UPDATE_PROC_STATE: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive update proc state command from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(msgpkt->sender)) )); - count = 1; + + name.jobid = job; running = false; - while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) { - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive got update_proc_state for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job))); - - name.jobid = job; - running = false; - /* get the job object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - goto CLEANUP; - } - /* if we are timing, the daemon will have included the time it - * recvd the launch msg - the maximum time between when we sent - * that message and a daemon recvd it tells us the time reqd - * to wireup the daemon comm network - */ - if (orte_timing) { - int64_t tmpsec, tmpusec; - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - /* keep the maximum time */ - if (tmpsec > jdata->max_launch_msg_recvd.tv_sec) { - jdata->max_launch_msg_recvd.tv_sec = tmpsec; - jdata->max_launch_msg_recvd.tv_usec = tmpusec; - } else if (tmpsec == jdata->max_launch_msg_recvd.tv_sec && - tmpusec > jdata->max_launch_msg_recvd.tv_usec) { - jdata->max_launch_msg_recvd.tv_usec = tmpusec; - } - if (orte_timing_details) { - int64_t sec, usec; - char *timestr; - 
ORTE_COMPUTE_TIME_DIFF(sec, usec, jdata->launch_msg_sent.tv_sec, jdata->launch_msg_sent.tv_usec, - tmpsec, tmpusec); - timestr = orte_pretty_print_timing(sec, usec); - fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n", - ORTE_VPID_PRINT(msgpkt->sender.vpid), timestr); - free(timestr); - } - } - count = 1; - while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID))) { - if (ORTE_VPID_INVALID == vpid) { - /* flag indicates that this job is complete - move on */ - break; - } - name.vpid = vpid; - ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); - - /* unpack the pid */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - /* if we are timing things, unpack the time this proc was started */ - if (orte_timing) { - int64_t tmpsec, tmpusec; - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (orte_timing_details) { - time_t tmptime; - char *tmpstr; - tmptime = tmpsec; - tmpstr = ctime(&tmptime); - /* remove the newline and the year at the end */ - tmpstr[strlen(tmpstr)-6] = '\0'; - fprintf(orte_timing_output, "Time rank %s was launched: %s.%3lu\n", - ORTE_VPID_PRINT(vpid), tmpstr, (unsigned long)(tmpusec/1000)); - } - } - /* unpack the state */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &state, &count, ORTE_PROC_STATE))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (ORTE_PROC_STATE_RUNNING == state) { - running = true; - } - /* unpack the exit code */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &exit_code, &count, ORTE_EXIT_CODE))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - - OPAL_OUTPUT_VERBOSE((5, 
orte_plm_globals.output, - "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code)); - - /* update the state */ - OPAL_RELEASE_THREAD(&lock, &cond, &processing); - orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF, - &name, state, pid, exit_code); - OPAL_ACQUIRE_THREAD(&lock, &cond, &processing); - } - count = 1; - } - if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - ORTE_ERROR_LOG(rc); - } else { - rc = ORTE_SUCCESS; - } - jdata->num_daemons_reported++; - if (orte_report_launch_progress && running) { - if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { - opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", - (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, - (int)jdata->num_launched, (int)jdata->num_procs); - } - } - break; - - case ORTE_PLM_INIT_ROUTES_CMD: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive init routes command", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - count=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto CLEANUP; } - name.jobid = job; - count=1; - while (ORTE_SUCCESS == opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID)) { + count = 1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) { if (ORTE_VPID_INVALID == vpid) { + /* flag indicates that this job is complete - move on */ break; } name.vpid = vpid; - -#if ORTE_ENABLE_EPOCH - count=1; - opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH); - name.epoch = epoch; -#endif - + /* unpack the pid */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) { 
+ ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + /* unpack the state */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + if (ORTE_PROC_STATE_RUNNING == state) { + running = true; + } + /* unpack the exit code */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive Described rank %s", + "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name))); - /* update the errmgr state */ - orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED, - &name, ORTE_PROC_STATE_REGISTERED, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - count=1; - } - /* pass the remainder of the buffer to the active module's - * init_routes API - */ - if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) { - ORTE_ERROR_LOG(rc); - } - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive done with init routes command", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - break; + (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code)); - default: - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive unknown command", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; - break; + /* get the proc data object */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + proc->state = state; + proc->pid = pid; + proc->exit_code = exit_code; + ORTE_ACTIVATE_PROC_STATE(&name, state); + } + if (running) { + jdata->num_daemons_reported++; + if (orte_report_launch_progress) { + if (0 == jdata->num_daemons_reported % 100 || + 
jdata->num_daemons_reported == orte_process_info.num_procs) { + opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs", + (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, + (int)jdata->num_launched, (int)jdata->num_procs); + } + } + } + /* prepare for next job */ + count = 1; } + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + break; + + case ORTE_PLM_INIT_ROUTES_CMD: + count=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + name.jobid = job; + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + goto CLEANUP; + } + count=1; + while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) { + if (ORTE_VPID_INVALID == vpid) { + break; + } + name.vpid = vpid; + ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED); + count=1; + } + /* pass the remainder of the buffer to the active module's + * init_routes API + */ + if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, buffer))) { + ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + break; + + default: + ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); + rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; + break; + } - CLEANUP: - /* release the message */ - OBJ_RELEASE(msgpkt); - OBJ_DESTRUCT(&answer); - if (ORTE_SUCCESS != rc) { - goto DEPART; - } + CLEANUP: + if (ORTE_SUCCESS != rc) { + goto DEPART; } DEPART: - /* release the thread */ - OPAL_RELEASE_THREAD(&lock, &cond, &processing); - /* see if an error occurred - if so, wakeup the HNP so we can exit */ if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) { - orte_jobs_complete(); + jdata = NULL; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, @@ -528,51 +373,8 @@ static void process_msg(int fd, short event, void *data) 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } -/* - * handle message from proxies - * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program. - * DO NOT RELEASE THIS BUFFER IN THIS CODE - */ - -void orte_plm_base_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:receive got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], true, sender, &buffer); - - /* reissue the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_PLM, - ORTE_RML_NON_PERSISTENT, - orte_plm_base_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } - - return; -} - /* where HNP messages come */ void orte_plm_base_receive_process_msg(int fd, short event, void *data) { - orte_message_event_t *mev = (orte_message_event_t*)data; - - ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], false, &mev->sender, &mev->buffer); - OBJ_RELEASE(mev); + assert(0); } diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index f23ca24ae1..d363621f83 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -36,7 +36,6 @@ #include "opal/class/opal_list.h" #include "opal/class/opal_pointer_array.h" #include "opal/dss/dss_types.h" -#include "opal/threads/condition.h" #include "opal/dss/dss_types.h" #include 
"orte/mca/plm/plm_types.h" @@ -51,26 +50,10 @@ BEGIN_C_DECLS typedef struct { /** Verbose/debug output stream */ int output; - /* orted cmd comm lock */ - opal_mutex_t orted_cmd_lock; - /* orted cmd cond */ - opal_condition_t orted_cmd_cond; /* next jobid */ uint16_t next_jobid; /* time when daemons started launch */ struct timeval daemonlaunchstart; - /* spawn lock */ - opal_mutex_t spawn_lock; - /* spawn cond */ - opal_condition_t spawn_cond; - /* spawn status */ - int spawn_status; - /* completion flag */ - bool spawn_complete; - /* spawn in progress cond */ - opal_condition_t spawn_in_progress_cond; - /* flag */ - bool spawn_in_progress; /* tree spawn cmd */ opal_buffer_t tree_spawn_cmd; /* daemon nodes assigned at launch */ @@ -92,18 +75,14 @@ ORTE_DECLSPEC int orte_plm_base_set_progress_sched(int sched); /* * Launch support */ -ORTE_DECLSPEC int orte_plm_base_setup_job(orte_job_t *jdata); -ORTE_DECLSPEC int orte_plm_base_launch_apps(orte_jobid_t job); - -ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons); - -ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void); - +ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata); ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata); - +ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void); ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata); - ORTE_DECLSPEC int orte_plm_base_setup_orted_cmd(int *argc, char ***argv); +ORTE_DECLSPEC void orte_plm_base_check_all_complete(int fd, short args, void *cbdata); ORTE_DECLSPEC int orte_plm_base_setup_virtual_machine(orte_job_t *jdata); /** diff --git a/orte/mca/plm/ccp/plm_ccp_module.c b/orte/mca/plm/ccp/plm_ccp_module.c index 082c1ee074..7d5c214274 100644 --- a/orte/mca/plm/ccp/plm_ccp_module.c +++ b/orte/mca/plm/ccp/plm_ccp_module.c @@ -483,7 +483,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata) /* Allow some progress to occur */ - 
opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK); + opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK); launched++; diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index fea17234c2..f61bf5f51a 100644 --- a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008 Institut National de Recherche en Informatique * et Automatique. All rights reserved. @@ -66,6 +66,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/state/state.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/base.h" @@ -98,6 +99,8 @@ orte_plm_base_module_t orte_plm_lsf_module = { plm_lsf_finalize }; +static void launch_daemons(int fd, short args, void *cbdata); + /** * Init the module */ @@ -122,6 +125,13 @@ int plm_lsf_init(void) orte_plm_globals.daemon_nodes_assigned_at_launch = false; } + /* point to our launch command */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, + launch_daemons, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return rc; } @@ -130,6 +140,18 @@ int plm_lsf_init(void) * the job can cleanly terminate */ static int plm_lsf_launch_job(orte_job_t *jdata) +{ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + /* this is a restart situation - skip to the mapping stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } else { + /* new job - set it up */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT); + } + return ORTE_SUCCESS; +} + +static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map; 
size_t num_nodes; @@ -144,25 +166,15 @@ static int plm_lsf_launch_job(orte_job_t *jdata) char *vpid_string; int i; char *cur_prefix; - struct timeval joblaunchstart, launchstart, launchstop; int proc_vpid_index = 0; bool failed_launch = true; orte_app_context_t *app; orte_node_t *node; orte_std_cntr_t nnode; - orte_jobid_t failed_job; - orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED; orte_job_t *daemons; + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = state->jdata; - /* default to declaring the daemons failed*/ - failed_job = ORTE_PROC_MY_NAME->jobid; - - if (orte_timing) { - if (0 != gettimeofday(&joblaunchstart, NULL)) { - opal_output(0, "plm_lsf: could not obtain job start time"); - } - } - /* start by setting up the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { @@ -175,11 +187,18 @@ static int plm_lsf_launch_job(orte_job_t *jdata) * look at the proposed process map */ if (orte_do_not_launch) { - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: launching vm", + "%s plm:lsf: launching vm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -191,12 +210,18 @@ static int plm_lsf_launch_job(orte_job_t *jdata) } num_nodes = map->num_new_daemons; - if (num_nodes == 0) { - /* have all the daemons we need - launch app */ + if (0 == num_nodes) { + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:lsf: no new daemons to 
launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto launch_apps; + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } /* create nodelist */ @@ -300,15 +325,6 @@ static int plm_lsf_launch_job(orte_job_t *jdata) /* setup environment */ env = opal_argv_copy(orte_launch_environ); - if (orte_timing) { - if (0 != gettimeofday(&launchstart, NULL)) { - opal_output(0, "plm_lsf: could not obtain start time"); - } - } - - /* set the job state to indicate we attempted to launch */ - job_state = ORTE_JOB_STATE_FAILED_TO_START; - /* lsb_launch tampers with SIGCHLD. * After the call to lsb_launch, the signal handler for SIGCHLD is NULL. * So, we disable the SIGCHLD handler of libevent for the duration of @@ -331,55 +347,13 @@ static int plm_lsf_launch_job(orte_job_t *jdata) } orte_wait_enable(); /* re-enable our SIGCHLD handler */ - /* wait for daemons to callback */ - if (ORTE_SUCCESS != - (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:lsf: daemon launch failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_ERROR_NAME(rc))); - goto cleanup; - } + /* indicate that the daemons for this job were launched */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; -launch_apps: - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - failed_job = jdata->jobid; - goto cleanup; - } - /* daemons succeeded - any failure now would be from apps */ - failed_job = jdata->jobid; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:lsf: launch of apps failed for job %s on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - /* declare the launch a success */ + /* flag that launch was 
successful, so far as we currently know */ failed_launch = false; - - if (orte_timing) { - if (0 != gettimeofday(&launchstop, NULL)) { - opal_output(0, "plm_lsf: could not obtain stop time"); - } else { - opal_output(0, "plm_lsf: daemon block launch time is %ld usec", - (launchstop.tv_sec - launchstart.tv_sec)*1000000 + - (launchstop.tv_usec - launchstart.tv_usec)); - opal_output(0, "plm_lsf: total job launch time is %ld usec", - (launchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + - (launchstop.tv_usec - joblaunchstart.tv_usec)); - } - } - if (ORTE_SUCCESS != rc) { - opal_output(0, "plm:lsf: start_procs returned error %d", rc); - goto cleanup; - } - -cleanup: + cleanup: if (NULL != argv) { opal_argv_free(argv); } @@ -387,20 +361,13 @@ cleanup: opal_argv_free(env); } + /* cleanup the caddy */ + OBJ_RELEASE(state); + /* check for failed launch - if so, force terminate */ if (failed_launch) { - if (ORTE_ERR_SILENT == rc) { - orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } else { - orte_errmgr.update_state(failed_job, job_state, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } - - return rc; } diff --git a/orte/mca/plm/plm.h b/orte/mca/plm/plm.h index bb45751508..4b1e3bbd52 100644 --- a/orte/mca/plm/plm.h +++ b/orte/mca/plm/plm.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,7 +55,7 @@ BEGIN_C_DECLS typedef int (*orte_plm_base_module_init_fn_t)(void); /* - * Spawn a job + * Spawn a job - this is a non-blocking function! 
*/ typedef int (*orte_plm_base_module_spawn_fn_t)(orte_job_t *jdata); diff --git a/orte/mca/plm/plm_types.h b/orte/mca/plm/plm_types.h index 32801ce377..9be7c661f5 100644 --- a/orte/mca/plm/plm_types.h +++ b/orte/mca/plm/plm_types.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -39,77 +41,118 @@ typedef int32_t orte_exit_code_t; typedef uint32_t orte_proc_state_t; #define ORTE_PROC_STATE_T OPAL_UINT32 +#define ORTE_PROC_STATE_ANY 0xffff + +#define ORTE_PROC_STATE_UNDEF 0 /* undefined process state */ +#define ORTE_PROC_STATE_INIT 1 /* process entry has been created by rmaps */ +#define ORTE_PROC_STATE_RESTART 2 /* the proc is ready for restart */ +#define ORTE_PROC_STATE_TERMINATE 3 /* process is marked for termination */ +#define ORTE_PROC_STATE_RUNNING 4 /* daemon has locally fork'd process */ +#define ORTE_PROC_STATE_REGISTERED 5 /* proc registered sync */ +#define ORTE_PROC_STATE_IOF_COMPLETE 6 /* io forwarding pipes have closed */ +#define ORTE_PROC_STATE_WAITPID_FIRED 7 /* waitpid fired on process */ -#define ORTE_PROC_STATE_UNDEF 0x00000000 /* undefined process state */ -#define ORTE_PROC_STATE_INIT 0x00000001 /* process entry has been created by rmaps */ -#define ORTE_PROC_STATE_RESTART 0x00000002 /* the proc is ready for restart */ -#define ORTE_PROC_STATE_LAUNCHED 0x00000004 /* process has been launched */ -#define ORTE_PROC_STATE_TERMINATE 0x00000008 /* process is marked for termination */ -#define ORTE_PROC_STATE_RUNNING 0x00000010 /* daemon has locally fork'd process */ -#define ORTE_PROC_STATE_REGISTERED 0x00000020 /* process has registered for sync */ /* * Define a "boundary" so we can easily and quickly determine * if a proc is still running or not - any value less than * this one means that we are 
not terminated */ -#define ORTE_PROC_STATE_UNTERMINATED 0x00000040 +#define ORTE_PROC_STATE_UNTERMINATED 15 + +#define ORTE_PROC_STATE_TERMINATED 20 /* process has terminated and is no longer running */ +/* Define a boundary so we can easily and quickly determine + * if a proc abnormally terminated - leave a little room + * for future expansion + */ +#define ORTE_PROC_STATE_ERROR 50 +/* Define specific error code values */ +#define ORTE_PROC_STATE_KILLED_BY_CMD (ORTE_PROC_STATE_ERROR + 1) /* process was killed by ORTE cmd */ +#define ORTE_PROC_STATE_ABORTED (ORTE_PROC_STATE_ERROR + 2) /* process aborted */ +#define ORTE_PROC_STATE_FAILED_TO_START (ORTE_PROC_STATE_ERROR + 3) /* process failed to start */ +#define ORTE_PROC_STATE_ABORTED_BY_SIG (ORTE_PROC_STATE_ERROR + 4) /* process aborted by signal */ +#define ORTE_PROC_STATE_TERM_WO_SYNC (ORTE_PROC_STATE_ERROR + 5) /* process exit'd w/o required sync */ +#define ORTE_PROC_STATE_COMM_FAILED (ORTE_PROC_STATE_ERROR + 6) /* process communication has failed */ +#define ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED (ORTE_PROC_STATE_ERROR + 7) /* process exceeded a sensor limit */ +#define ORTE_PROC_STATE_CALLED_ABORT (ORTE_PROC_STATE_ERROR + 8) /* process called "errmgr.abort" */ +#define ORTE_PROC_STATE_HEARTBEAT_FAILED (ORTE_PROC_STATE_ERROR + 9) /* heartbeat failed to arrive */ +#define ORTE_PROC_STATE_MIGRATING (ORTE_PROC_STATE_ERROR + 10) /* process failed and is waiting for resources before restarting */ +#define ORTE_PROC_STATE_CANNOT_RESTART (ORTE_PROC_STATE_ERROR + 11) /* process failed and cannot be restarted */ +#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */ +#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */ + +/* Define a boundary so that external developers + * have a starting point for defining their own + * proc states + */ +#define ORTE_PROC_STATE_DYNAMIC 100 + -#define 
ORTE_PROC_STATE_TERMINATED 0x00000080 /* process has terminated and is no longer running */ -#define ORTE_PROC_STATE_KILLED_BY_CMD 0x00000100 /* process was killed by ORTE cmd */ -#define ORTE_PROC_STATE_ABORTED 0x00000200 /* process aborted */ -#define ORTE_PROC_STATE_FAILED_TO_START 0x00000400 /* process failed to start */ -#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x00000800 /* process aborted by signal */ -#define ORTE_PROC_STATE_TERM_WO_SYNC 0x00001000 /* process exit'd w/o required sync */ -#define ORTE_PROC_STATE_COMM_FAILED 0x00002000 /* process communication has failed */ -#define ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* process exceeded a sensor limit */ -#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */ -#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */ -#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */ -#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */ -#define ORTE_PROC_STATE_TERM_NON_ZERO 0x00080000 /* process exited with a non-zero status, indicating abnormal */ -#define ORTE_PROC_STATE_RESTARTED 0x00100000 /* process restarted */ /* * Job state codes */ -typedef uint32_t orte_job_state_t; -#define ORTE_JOB_STATE_T OPAL_UINT32 +typedef int32_t orte_job_state_t; +#define ORTE_JOB_STATE_T OPAL_INT32 +#define ORTE_JOB_STATE_ANY 0xffff + +#define ORTE_JOB_STATE_UNDEF 0 +#define ORTE_JOB_STATE_INIT 1 /* ready to be assigned id */ +#define ORTE_JOB_STATE_ALLOCATE 2 /* ready to be allocated */ +#define ORTE_JOB_STATE_MAP 3 /* ready to be mapped */ +#define ORTE_JOB_STATE_SYSTEM_PREP 4 /* ready for final sanity check and system values updated */ +#define ORTE_JOB_STATE_LAUNCH_DAEMONS 5 /* ready to launch daemons */ +#define ORTE_JOB_STATE_DAEMONS_LAUNCHED 6 /* daemons for this job have been launched */ +#define ORTE_JOB_STATE_DAEMONS_REPORTED 7 /* all launched daemons have reported */ +#define 
ORTE_JOB_STATE_LAUNCH_APPS 8 /* ready to launch apps */ +#define ORTE_JOB_STATE_RUNNING 9 /* all procs have been fork'd */ +#define ORTE_JOB_STATE_SUSPENDED 10 /* job has been suspended */ +#define ORTE_JOB_STATE_REGISTERED 11 /* all procs registered for sync */ +#define ORTE_JOB_STATE_READY_FOR_DEBUGGERS 12 /* job ready for debugger init after spawn */ +#define ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE 13 /* all local procs have attempted launch */ -#define ORTE_JOB_STATE_UNDEF 0x00000000 -#define ORTE_JOB_STATE_INIT 0x00000001 /* job entry has been created by rmaps */ -#define ORTE_JOB_STATE_RESTART 0x00000002 /* the job is ready for restart after one or more procs failed */ -#define ORTE_JOB_STATE_LAUNCHED 0x00000004 /* job has been launched by plm */ -#define ORTE_JOB_STATE_RUNNING 0x00000008 /* all process have been fork'd */ -#define ORTE_JOB_STATE_SUSPENDED 0x00000010 /* job has been suspended */ -#define ORTE_JOB_STATE_REGISTERED 0x00000020 /* all procs registered for sync */ /* * Define a "boundary" so we can easily and quickly determine * if a job is still running or not - any value less than * this one means that we are not terminated */ -#define ORTE_JOB_STATE_UNTERMINATED 0x00000040 +#define ORTE_JOB_STATE_UNTERMINATED 20 -#define ORTE_JOB_STATE_TERMINATED 0x00000080 /* all processes have terminated and is no longer running */ -#define ORTE_JOB_STATE_ABORTED 0x00000100 /* at least one process aborted, causing job to abort */ -#define ORTE_JOB_STATE_FAILED_TO_START 0x00000200 /* at least one process failed to start */ -#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x00000400 /* job was killed by a signal */ -#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x00000800 /* job was aborted because proc exit'd w/o required sync */ -#define ORTE_JOB_STATE_KILLED_BY_CMD 0x00001000 /* job was killed by ORTE cmd */ -#define ORTE_JOB_STATE_COMM_FAILED 0x00002000 /* communication has failed */ -#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a 
sensor limit */ -#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */ -#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */ -#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */ -#define ORTE_JOB_STATE_NON_ZERO_TERM 0x00040000 /* at least one process exited with non-zero status */ -#define ORTE_JOB_STATE_SILENT_ABORT 0x00080000 /* an error occurred and was reported elsewhere, so error out quietly */ +#define ORTE_JOB_STATE_TERMINATED 21 /* all processes have terminated and job is no longer running */ +#define ORTE_JOB_STATE_ALL_JOBS_COMPLETE 22 +#define ORTE_JOB_STATE_DAEMONS_TERMINATED 23 -/* the job never even attempted to launch due to an error earlier in the - * launch procedure +/* Define a boundary so we can easily and quickly determine + * if a job abnormally terminated - leave a little room + * for future expansion */ -#define ORTE_JOB_STATE_NEVER_LAUNCHED 0x10000000 +#define ORTE_JOB_STATE_ERROR 50 +/* Define specific error code values */ +#define ORTE_JOB_STATE_KILLED_BY_CMD (ORTE_JOB_STATE_ERROR + 1) /* job was killed by ORTE cmd */ +#define ORTE_JOB_STATE_ABORTED (ORTE_JOB_STATE_ERROR + 2) /* at least one process aborted, causing job to abort */ +#define ORTE_JOB_STATE_FAILED_TO_START (ORTE_JOB_STATE_ERROR + 3) /* at least one process failed to start */ +#define ORTE_JOB_STATE_ABORTED_BY_SIG (ORTE_JOB_STATE_ERROR + 4) /* job was killed by a signal */ +#define ORTE_JOB_STATE_ABORTED_WO_SYNC (ORTE_JOB_STATE_ERROR + 5) /* job was aborted because proc exit'd w/o required sync */ +#define ORTE_JOB_STATE_COMM_FAILED (ORTE_JOB_STATE_ERROR + 6) /* communication has failed */ +#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED (ORTE_JOB_STATE_ERROR + 7) /* job had a process that exceeded a sensor limit */ +#define ORTE_JOB_STATE_CALLED_ABORT (ORTE_JOB_STATE_ERROR + 8) /* at least one process called "errmgr.abort" */ +#define ORTE_JOB_STATE_HEARTBEAT_FAILED 
(ORTE_JOB_STATE_ERROR + 9) /* heartbeat failed to arrive */ +#define ORTE_JOB_STATE_NEVER_LAUNCHED (ORTE_JOB_STATE_ERROR + 10) /* the job never even attempted to launch due to + * an error earlier in the + * launch procedure + */ +#define ORTE_JOB_STATE_ABORT_ORDERED (ORTE_JOB_STATE_ERROR + 11) /* the processes in this job have been ordered to "die", + * but may not have completed it yet. Don't order it again + */ +#define ORTE_JOB_STATE_NON_ZERO_TERM (ORTE_JOB_STATE_ERROR + 12) /* at least one process exited with non-zero status */ +#define ORTE_JOB_STATE_FAILED_TO_LAUNCH (ORTE_JOB_STATE_ERROR + 13) +#define ORTE_JOB_STATE_FORCED_EXIT (ORTE_JOB_STATE_ERROR + 14) +#define ORTE_JOB_STATE_SILENT_ABORT (ORTE_JOB_STATE_ERROR + 16) /* an error occurred and was reported elsewhere, so error out quietly */ -/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */ -#define ORTE_JOB_STATE_ABORT_ORDERED 0x20010000 +/* Define a boundary so that external developers + * have a starting point for defining their own + * job states + */ +#define ORTE_JOB_STATE_DYNAMIC 100 /** @@ -137,6 +180,12 @@ orte_node_state_t) */ /** Node is up, but not part of the node pool for jobs */ #define ORTE_NODE_STATE_NOT_INCLUDED 5 +/* Define a boundary so that external developers + * have a starting point for defining their own + * node states + */ +#define ORTE_NODE_STATE_DYNAMIC 100 + /* * PLM commands */ diff --git a/orte/mca/plm/process/plm_process_module.c b/orte/mca/plm/process/plm_process_module.c index 435ec450db..84c96e50fa 100644 --- a/orte/mca/plm/process/plm_process_module.c +++ b/orte/mca/plm/process/plm_process_module.c @@ -89,6 +89,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ras/ras_types.h" #include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/state/state.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/base.h" @@ -142,8 +143,38 @@ static const char * orte_plm_process_shell_name[] = { "unknown" }; 
+typedef struct { + opal_list_item_t super; + int argc; + char **argv; + orte_proc_t *daemon; +} orte_plm_process_caddy_t; +static void caddy_const(orte_plm_process_caddy_t *ptr) +{ + ptr->argv = NULL; + ptr->daemon = NULL; +} +static void caddy_dest(orte_plm_process_caddy_t *ptr) +{ + if (NULL != ptr->argv) { + opal_argv_free(ptr->argv); + } + if (NULL != ptr->daemon) { + OBJ_RELEASE(ptr->daemon); + } +} +OBJ_CLASS_INSTANCE(orte_plm_process_caddy_t, + opal_list_item_t, + caddy_const, caddy_dest); + /* local global storage of timing variables */ static struct timeval joblaunchstart, joblaunchstop; +static int num_in_progress=0; +static opal_list_t launch_list; +static opal_event_t launch_event; + +static void launch_daemons(int fd, short args, void *cbdata); +static void process_launch_list(int fd, short args, void *cbdata); #ifdef _MSC_VER /* @@ -154,7 +185,6 @@ static int wmi_launch_child(char *prefix, char *remote_node, int argc, char **ar static int get_credential(char *node_name); static char *read_remote_registry(uint32_t root, char *sub_key, char *key, char *remote_node, char *ntlm_auth); - /* local global storage of user credential */ static char user_name[CREDUI_MAX_USERNAME_LENGTH+1]; static char user_password[CREDUI_MAX_PASSWORD_LENGTH+1]; @@ -446,10 +476,11 @@ cleanup: /** * Remote spawn process using WMI. */ -static int wmi_launch_child(char *prefix, char *remote_node, int argc, char **argv) +static int wmi_launch_child(char *remote_node, int argc, char **argv) { char *command_line = NULL; int len = 0, pid = -1; + char *prefix; HRESULT hres; IWbemClassObject* pClass_cimv2 = NULL; @@ -509,28 +540,23 @@ static int wmi_launch_child(char *prefix, char *remote_node, int argc, char **ar or Open MPI was configured without ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT), let's first check the OPENMPI_HOME in user environment variables, and then the OPAL_PREFIX registry value. 
*/ - if( NULL == prefix ) { - if( mca_plm_process_component.remote_env_prefix ) { - char *path = "Environment"; - char *key = "OPENMPI_HOME"; - /* read registry at HKEY_CURRENT_USER - please note: this MUST be the same user as for WMI authorization. */ - char *reg_prefix = read_remote_registry(0x80000001, path, key, remote_node, ntlm_auth); - if( NULL != reg_prefix) { - command_line = generate_commandline(reg_prefix, argc, argv); - } + if( mca_plm_process_component.remote_env_prefix ) { + char *path = "Environment"; + char *key = "OPENMPI_HOME"; + /* read registry at HKEY_CURRENT_USER + please note: this MUST be the same user as for WMI authorization. */ + char *reg_prefix = read_remote_registry(0x80000001, path, key, remote_node, ntlm_auth); + if( NULL != reg_prefix) { + command_line = generate_commandline(reg_prefix, argc, argv); } - if( NULL == command_line && mca_plm_process_component.remote_reg_prefix ) { - char *path = "Software\\Open MPI\\"; - char *key = "OPAL_PREFIX"; - char *reg_prefix = read_remote_registry(0x80000002, path, key, remote_node, ntlm_auth); - if( NULL != reg_prefix) { - command_line = generate_commandline(reg_prefix, argc, argv); - } + } + if( NULL == command_line && mca_plm_process_component.remote_reg_prefix ) { + char *path = "Software\\Open MPI\\"; + char *key = "OPAL_PREFIX"; + char *reg_prefix = read_remote_registry(0x80000002, path, key, remote_node, ntlm_auth); + if( NULL != reg_prefix) { + command_line = generate_commandline(reg_prefix, argc, argv); } - } else { - /* use user specified/default prefix */ - command_line = generate_commandline(prefix, argc, argv); } if ( NULL == command_line ) { @@ -624,6 +650,18 @@ int orte_plm_process_init(void) ORTE_ERROR_LOG(rc); } + /* point to our launch command */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, + launch_daemons, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* setup the event for metering the launch */ + 
OBJ_CONSTRUCT(&launch_list, opal_list_t); + opal_event_set(orte_event_base, &launch_event, -1, 0, process_launch_list, NULL); + opal_event_set_priority(&launch_event, ORTE_SYS_PRI); + /* we assign daemon nodes at launch */ orte_plm_globals.daemon_nodes_assigned_at_launch = true; @@ -894,6 +932,9 @@ static int orte_plm_process_fill_exec_path( char ** exec_path ) static void orte_plm_process_wait_daemon(pid_t pid, int status, void* cbdata) { + orte_job_t *jdata; + orte_plm_process_caddy_t *caddy=(orte_plm_process_caddy_t*)cbdata; + orte_proc_t *daemon=caddy->daemon; unsigned long deltat; if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { @@ -919,34 +960,24 @@ static void orte_plm_process_wait_daemon(pid_t pid, int status, void* cbdata) } else { opal_output(0, "No extra status information is available: %d.", status); } - /* report that the daemon has failed so we break out of the daemon - * callback receive and can exit - */ - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, - 0, status); + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + + /* note that this daemon failed */ + daemon->state = ORTE_PROC_STATE_FAILED_TO_START; + /* increment the #daemons terminated so we will exit properly */ + jdata->num_terminated++; + /* report that the daemon has failed so we can exit */ + ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START); } /* if abnormal exit */ - /* release any waiting threads */ - OPAL_THREAD_LOCK(&mca_plm_process_component.lock); - - if (mca_plm_process_component.num_children-- >= - mca_plm_process_component.num_concurrent || - mca_plm_process_component.num_children == 0) { - opal_condition_signal(&mca_plm_process_component.cond); + /* release any delay */ + --num_in_progress; + if (num_in_progress < mca_plm_process_component.num_concurrent) { + /* trigger continuation of the launch */ + opal_event_active(&launch_event, EV_WRITE, 1); } - - if 
(mca_plm_process_component.timing && mca_plm_process_component.num_children == 0) { - if (0 != gettimeofday(&joblaunchstop, NULL)) { - opal_output(0, "plm_process: could not obtain job launch stop time"); - } else { - deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + - (joblaunchstop.tv_usec - joblaunchstart.tv_usec); - opal_output(0, "plm_process: total time to launch job is %lu usec", deltat); - } - } - - OPAL_THREAD_UNLOCK(&mca_plm_process_component.lock); + /* cleanup */ + OBJ_RELEASE(caddy); } @@ -960,6 +991,80 @@ static void orte_plm_process_wait_daemon(pid_t pid, int status, void* cbdata) * the job can cleanly terminate */ static int orte_plm_process_launch(orte_job_t *jdata) +{ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + /* this is a restart situation - skip to the mapping stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } else { + /* new job - set it up */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT); + } + return ORTE_SUCCESS; +} + +static void process_launch_list(int fd, short args, void *cbdata) +{ + opal_list_item_t *item; + pid_t pid; + orte_plm_process_caddy_t *caddy; + char **env; + char *param; + + while (num_in_progress < mca_plm_process_component.num_concurrent) { + item = opal_list_remove_first(&launch_list); + if (NULL == item) { + /* we are done */ + break; + } + caddy = (orte_plm_process_caddy_t*)item; + + /* Set signal handlers back to the default. Do this close + to the execve() because the event library may (and likely + will) reset them. If we don't do this, the event + library may have left some set that, at least on some + OS's, don't get reset via fork() or exec(). Hence, the + orted could be unkillable (for example). 
*/ + + set_handler_default(SIGTERM); + set_handler_default(SIGINT); + set_handler_default(SIGCHLD); + + /* setup environment */ + env = opal_argv_copy(orte_launch_environ); + + /* exec the daemon */ + if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { + param = opal_argv_join(caddy->argv, ' '); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:process: executing:\n\t%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == param) ? "NULL" : param)); + if (NULL != param) free(param); + } + +#ifdef _MSC_VER + /* launch remote process */ + pid = wmi_launch_child(caddy->daemon->nodename, caddy->argc, caddy->argv); +#else + pid = _spawnve( _P_NOWAIT, caddy->daemon->name, caddy->argv, env); +#endif + + if (pid < 0) { + /* note that this daemon failed */ + caddy->daemon->state = ORTE_PROC_STATE_FAILED_TO_START; + /* report that the daemon has failed so we can exit */ + ORTE_ACTIVATE_PROC_STATE(&(caddy->daemon->name), ORTE_PROC_STATE_FAILED_TO_START); + continue; + } + /* setup callback on sigchild - wait until setup above is complete + * as the callback can occur in the call to orte_wait_cb + */ + orte_wait_cb(pid, orte_plm_process_wait_daemon, (void*)caddy); + num_in_progress++; + } +} + +static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map = NULL; int proc_vpid_index; @@ -971,12 +1076,13 @@ static int orte_plm_process_launch(orte_job_t *jdata) int argc = 0; int rc; char *lib_base = NULL, *bin_base = NULL; - bool failed_launch = true; orte_app_context_t *app; orte_node_t *node; orte_std_cntr_t nnode; - orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED; orte_job_t *daemons; + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = state->jdata; + orte_plm_process_caddy_t *caddy; if (orte_timing) { if (0 != gettimeofday(&joblaunchstart, NULL)) { @@ -986,25 +1092,28 @@ static int orte_plm_process_launch(orte_job_t *jdata) } } - /* if we don't want to launch, then don't attempt to - * launch the 
daemons - the user really wants to just - * look at the proposed process map - */ - if (orte_do_not_launch) { - goto launch_apps; - } - - /* start by launching the virtual machine */ + /* setup the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:process: launching vm", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - + /* if we don't want to launch, then don't attempt to + * launch the daemons - the user really wants to just + * look at the proposed process map + */ + if (orte_do_not_launch) { + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; + } + /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -1013,13 +1122,20 @@ static int orte_plm_process_launch(orte_job_t *jdata) } if (0 == map->num_new_daemons) { - /* have all the daemons we need - launch app */ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:process: no new daemons to launch", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:process: launching vm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (orte_debug_daemons_flag && mca_plm_process_component.num_concurrent < map->num_new_daemons) { /** @@ 
-1121,9 +1237,6 @@ static int orte_plm_process_launch(orte_job_t *jdata) lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); - /* set the job state to indicate we attempted to launch */ - job_state = ORTE_JOB_STATE_FAILED_TO_START; - /* * Iterate through each of the nodes */ @@ -1151,7 +1264,7 @@ static int orte_plm_process_launch(orte_job_t *jdata) "%s plm:process:launch daemon failed to be defined on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); - return ORTE_ERR_FATAL; + continue; } if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { @@ -1195,7 +1308,7 @@ static int orte_plm_process_launch(orte_job_t *jdata) if( NULL == exec_path ) { rc = orte_plm_process_fill_exec_path (&exec_path); if (ORTE_SUCCESS != rc) { - return rc; + goto cleanup; } } } @@ -1280,95 +1393,47 @@ static int orte_plm_process_launch(orte_job_t *jdata) argv[proc_vpid_index] = strdup(vpid_string); free(vpid_string); - /* Set signal handlers back to the default. Do this close - to the execve() because the event library may (and likely - will) reset them. If we don't do this, the event - library may have left some set that, at least on some - OS's, don't get reset via fork() or exec(). Hence, the - orted could be unkillable (for example). */ - - set_handler_default(SIGTERM); - set_handler_default(SIGINT); - set_handler_default(SIGCHLD); - - /* setup environment */ - env = opal_argv_copy(orte_launch_environ); - - /* exec the daemon */ - if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { - param = opal_argv_join(exec_argv, ' '); - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:process: executing:\n\t%s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == param) ? 
"NULL" : param)); - if (NULL != param) free(param); - } - -#ifdef _MSC_VER - /* launch remote process */ - pid = wmi_launch_child(prefix_dir, node->name, argc, exec_argv); -#else - pid = _spawnve( _P_NOWAIT, exec_path, exec_argv, env); -#endif - - if (pid < 0) { - failed_launch = true; - rc = ORTE_ERROR; - goto cleanup; - } - /* indicate this daemon has been launched in case anyone is sitting on that trigger */ - node->daemon->state = ORTE_PROC_STATE_LAUNCHED; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:process: daemon launched (pid %d on %s)\n", + "%s plm:process: adding node %s to launch list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - pid, nodes[nnode]->name)); - - OPAL_THREAD_LOCK(&mca_plm_process_component.lock); - /* This situation can lead to a deadlock if '--debug-daemons' is set. - * However, the deadlock condition is tested at the begining of this - * function, so we're quite confident it should not happens here. - */ - if (mca_plm_process_component.num_children++ >= - mca_plm_process_component.num_concurrent) { - opal_condition_wait(&mca_plm_process_component.cond, &mca_plm_process_component.lock); - } - OPAL_THREAD_UNLOCK(&mca_plm_process_component.lock); - - /* if required - add delay to avoid problems w/ X11 authentication */ - if (0 < opal_output_get_verbosity(orte_plm_globals.output) - && mca_plm_process_component.delay) { - sleep(mca_plm_process_component.delay); - } + node->name)); + + /* we are in an event, so no need to protect the list */ + caddy = OBJ_NEW(orte_plm_process_caddy_t); + caddy->argc = argc; + caddy->argv = opal_argv_copy(argv); + caddy->daemon = node->daemon; + OBJ_RETAIN(caddy->daemon); + opal_list_append(&launch_list, &caddy->super); } } - /* wait for daemons to callback */ - if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:process: daemon launch failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - 
ORTE_ERROR_NAME(rc))); - goto cleanup; - } + /* set the job state to indicate the daemons are launched */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; -launch_apps: - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:process: launch of apps failed for job %s on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); - goto cleanup; - } + /* trigger the event to start processing the launch list */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:process: activating launch event", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + opal_event_active(&launch_event, EV_WRITE, 1); - /* get here if launch went okay */ - failed_launch = false; + if (NULL != lib_base) { + free(lib_base); + } + if (NULL != bin_base) { + free(bin_base); + } + + if (NULL != argv) { + opal_argv_free(argv); + } + /* now that we've launched the daemons, let the daemon callback + * function determine they are all alive and trigger the next stage + */ + OBJ_RELEASE(state); + return; - cleanup: +cleanup: if (NULL != lib_base) { free(lib_base); } @@ -1380,14 +1445,8 @@ launch_apps: opal_argv_free(argv); } - /* check for failed launch - if so, force terminate */ - if( failed_launch ) { - orte_errmgr.update_state(jdata->jobid, job_state, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } - - return rc; + OBJ_RELEASE(state); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } diff --git a/orte/mca/plm/rsh/plm_rsh.h b/orte/mca/plm/rsh/plm_rsh.h index e05f57a41f..0b46177f67 100644 --- a/orte/mca/plm/rsh/plm_rsh.h +++ b/orte/mca/plm/rsh/plm_rsh.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
+ * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -56,9 +58,7 @@ struct orte_plm_rsh_component_t { struct timespec delay; int priority; bool tree_spawn; - size_t num_concurrent; - opal_mutex_t lock; - opal_condition_t cond; + int num_concurrent; char *agent; bool assume_same_shell; bool pass_environ_mca_params; diff --git a/orte/mca/plm/rsh/plm_rsh_component.c b/orte/mca/plm/rsh/plm_rsh_component.c index 222f03b388..b6105a7887 100644 --- a/orte/mca/plm/rsh/plm_rsh_component.c +++ b/orte/mca/plm/rsh/plm_rsh_component.c @@ -105,8 +105,6 @@ static int rsh_component_open(void) char *ctmp, **cargv; /* initialize globals */ - OBJ_CONSTRUCT(&mca_plm_rsh_component.lock, opal_mutex_t); - OBJ_CONSTRUCT(&mca_plm_rsh_component.cond, opal_condition_t); mca_plm_rsh_component.using_qrsh = false; mca_plm_rsh_component.using_llspawn = false; @@ -263,10 +261,6 @@ success: static int rsh_component_close(void) { - /* cleanup state */ - OBJ_DESTRUCT(&mca_plm_rsh_component.lock); - OBJ_DESTRUCT(&mca_plm_rsh_component.cond); - return ORTE_SUCCESS; } diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 9e0e2851ed..7dc51dcd04 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -88,6 +88,7 @@ #include "orte/mca/rmaps/rmaps.h" #include "orte/mca/routed/routed.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/state/state.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/base.h" @@ -112,6 +113,30 @@ orte_plm_base_module_t orte_plm_rsh_module = { rsh_finalize }; +typedef struct { + opal_list_item_t super; + int argc; + char **argv; + orte_proc_t *daemon; +} orte_plm_rsh_caddy_t; +static void caddy_const(orte_plm_rsh_caddy_t *ptr) +{ + ptr->argv = NULL; + ptr->daemon = NULL; +} +static void caddy_dest(orte_plm_rsh_caddy_t 
*ptr) +{ + if (NULL != ptr->argv) { + opal_argv_free(ptr->argv); + } + if (NULL != ptr->daemon) { + OBJ_RELEASE(ptr->daemon); + } +} +OBJ_CLASS_INSTANCE(orte_plm_rsh_caddy_t, + opal_list_item_t, + caddy_const, caddy_dest); + typedef enum { ORTE_PLM_RSH_SHELL_BASH = 0, ORTE_PLM_RSH_SHELL_ZSH, @@ -139,24 +164,21 @@ static const char *orte_plm_rsh_shell_name[7] = { static void set_handler_default(int sig); static orte_plm_rsh_shell_t find_shell(char *shell); static int launch_agent_setup(const char *agent, char *path); -static void ssh_child(int argc, char **argv, - orte_vpid_t vpid, int proc_vpid_index) - __opal_attribute_noreturn__; +static void ssh_child(int argc, char **argv) __opal_attribute_noreturn__; static int rsh_probe(char *nodename, orte_plm_rsh_shell_t *shell); static int setup_shell(orte_plm_rsh_shell_t *rshell, orte_plm_rsh_shell_t *lshell, char *nodename, int *argc, char ***argv); - -/* local global storage of timing variables */ -static struct timeval joblaunchstart, joblaunchstop; +static void launch_daemons(int fd, short args, void *cbdata); +static void process_launch_list(int fd, short args, void *cbdata); /* local global storage */ static char *rsh_agent_path=NULL; static char **rsh_agent_argv=NULL; -static opal_list_t my_children; -static size_t num_children, num_active=0; -static orte_vpid_t my_parent; +static int num_in_progress=0; +static opal_list_t launch_list; +static opal_event_t launch_event; /** * Init the module @@ -207,18 +229,27 @@ static int rsh_init(void) return rc; } } - + + /* point to our launch command */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, + launch_daemons, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* setup the event for metering the launch */ + OBJ_CONSTRUCT(&launch_list, opal_list_t); + opal_event_set(orte_event_base, &launch_event, -1, 0, process_launch_list, NULL); + opal_event_set_priority(&launch_event, ORTE_SYS_PRI); + + /* start the recvs */ if 
(ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); } - /* initialize the children tree */ - OBJ_CONSTRUCT(&my_children, opal_list_t); - num_children = 0; - /* we assign daemon nodes at launch */ orte_plm_globals.daemon_nodes_assigned_at_launch = true; - + return rc; } @@ -230,27 +261,36 @@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata) orte_std_cntr_t cnt=1; uint8_t flag; orte_job_t *jdata; + orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata; + orte_proc_t *daemon=caddy->daemon; + if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { + /* ignore any such report - it will occur if we left the + * session attached, e.g., while debugging + */ + OBJ_RELEASE(caddy); + return; + } + if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */ /* if we are not the HNP, send a message to the HNP alerting it * to the failure */ if (!ORTE_PROC_IS_HNP) { - opal_buffer_t buf; - orte_vpid_t *vpid=(orte_vpid_t*)cbdata; + opal_buffer_t *buf; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s daemon %d failed with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)*vpid, WEXITSTATUS(status))); - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR); + (int)daemon->name.vpid, WEXITSTATUS(status))); + buf = OBJ_NEW(opal_buffer_t); + opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR); flag = 1; - opal_dss.pack(&buf, &flag, 1, OPAL_UINT8); - opal_dss.pack(&buf, vpid, 1, ORTE_VPID); - orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0); - OBJ_DESTRUCT(&buf); + opal_dss.pack(buf, &flag, 1, OPAL_UINT8); + opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID); + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, + ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0, + orte_rml_send_callback, NULL); } else { - orte_proc_t *daemon=(orte_proc_t*)cbdata; jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, @@ -262,21 +302,18 
@@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata) /* increment the #daemons terminated so we will exit properly */ jdata->num_terminated++; /* report that the daemon has failed so we can exit */ - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, pid, status); + ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START); } } - - /* release any waiting threads */ - OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock); - - if (num_active-- >= mca_plm_rsh_component.num_concurrent || - num_active == 0) { - opal_condition_signal(&mca_plm_rsh_component.cond); + + /* release any delay */ + --num_in_progress; + if (num_in_progress < mca_plm_rsh_component.num_concurrent) { + /* trigger continuation of the launch */ + opal_event_active(&launch_event, EV_WRITE, 1); } - - OPAL_THREAD_UNLOCK(&mca_plm_rsh_component.lock); - + /* cleanup */ + OBJ_RELEASE(caddy); } static int setup_launch(int *argcptr, char ***argvptr, @@ -296,33 +333,33 @@ static int setup_launch(int *argcptr, char ***argvptr, int rc; int cnt, i, j; bool found; - + /* Figure out the basenames for the libdir and bindir. This - requires some explanation: + requires some explanation: - - Use opal_install_dirs.libdir and opal_install_dirs.bindir. + - Use opal_install_dirs.libdir and opal_install_dirs.bindir. - - After a discussion on the devel-core mailing list, the - developers decided that we should use the local directory - basenames as the basis for the prefix on the remote note. - This does not handle a few notable cases (e.g., if the - libdir/bindir is not simply a subdir under the prefix, if the - libdir/bindir basename is not the same on the remote node as - it is here on the local node, etc.), but we decided that - --prefix was meant to handle "the common case". 
If you need - something more complex than this, a) edit your shell startup - files to set PATH/LD_LIBRARY_PATH properly on the remove - node, or b) use some new/to-be-defined options that - explicitly allow setting the bindir/libdir on the remote - node. We decided to implement these options (e.g., - --remote-bindir and --remote-libdir) to orterun when it - actually becomes a problem for someone (vs. a hypothetical - situation). + - After a discussion on the devel-core mailing list, the + developers decided that we should use the local directory + basenames as the basis for the prefix on the remote note. + This does not handle a few notable cases (e.g., if the + libdir/bindir is not simply a subdir under the prefix, if the + libdir/bindir basename is not the same on the remote node as + it is here on the local node, etc.), but we decided that + --prefix was meant to handle "the common case". If you need + something more complex than this, a) edit your shell startup + files to set PATH/LD_LIBRARY_PATH properly on the remove + node, or b) use some new/to-be-defined options that + explicitly allow setting the bindir/libdir on the remote + node. We decided to implement these options (e.g., + --remote-bindir and --remote-libdir) to orterun when it + actually becomes a problem for someone (vs. a hypothetical + situation). - Hence, for now, we simply take the basename of this install's - libdir and bindir and use it to append this install's prefix - and use that on the remote node. - */ + Hence, for now, we simply take the basename of this install's + libdir and bindir and use it to append this install's prefix + and use that on the remote node. 
+ */ lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); @@ -402,11 +439,11 @@ static int setup_launch(int *argcptr, char ***argvptr, */ char *opal_prefix = getenv("OPAL_PREFIX"); char* full_orted_cmd = NULL; - + if( NULL != orted_cmd ) { asprintf( &full_orted_cmd, "%s/%s/%s", prefix_dir, bin_base, orted_cmd ); } - + if (ORTE_PLM_RSH_SHELL_SH == remote_shell || ORTE_PLM_RSH_SHELL_KSH == remote_shell || ORTE_PLM_RSH_SHELL_ZSH == remote_shell || @@ -431,13 +468,13 @@ static int setup_launch(int *argcptr, char ***argvptr, } else if (ORTE_PLM_RSH_SHELL_TCSH == remote_shell || ORTE_PLM_RSH_SHELL_CSH == remote_shell) { /* [t]csh is a bit more challenging -- we - have to check whether LD_LIBRARY_PATH - is already set before we try to set it. - Must be very careful about obeying - [t]csh's order of evaluation and not - using a variable before it is defined. - See this thread for more details: - http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */ + have to check whether LD_LIBRARY_PATH + is already set before we try to set it. + Must be very careful about obeying + [t]csh's order of evaluation and not + using a variable before it is defined. + See this thread for more details: + http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */ /* if there is nothing preceding orted, then we can just * assemble the cmd with the orted_cmd at the end. 
Otherwise, * we have to insert the orted_prefix in the right place @@ -502,8 +539,8 @@ static int setup_launch(int *argcptr, char ***argvptr, (mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) && ((!mca_plm_rsh_component.using_llspawn) || (mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) { - opal_argv_append(&argc, &argv, "--daemonize"); - } + opal_argv_append(&argc, &argv, "--daemonize"); + } /* * Add the basic arguments to the orted command line, including @@ -538,7 +575,7 @@ static int setup_launch(int *argcptr, char ***argvptr, free(param); } } - + /* unless told otherwise... */ if (mca_plm_rsh_component.pass_environ_mca_params) { /* now check our local environment for MCA params - add them @@ -583,7 +620,7 @@ static int setup_launch(int *argcptr, char ***argvptr, } } } - + value = opal_argv_join(argv, ' '); if (sysconf(_SC_ARG_MAX) < (int)strlen(value)) { orte_show_help("help-plm-rsh.txt", "cmd-line-too-long", @@ -592,12 +629,12 @@ static int setup_launch(int *argcptr, char ***argvptr, return ORTE_ERR_SILENT; } free(value); - + if (ORTE_PLM_RSH_SHELL_SH == remote_shell || ORTE_PLM_RSH_SHELL_KSH == remote_shell) { opal_argv_append(&argc, &argv, ")"); } - + if (0 < opal_output_get_verbosity(orte_plm_globals.output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, @@ -614,18 +651,16 @@ static int setup_launch(int *argcptr, char ***argvptr, } /* actually ssh the child */ -static void ssh_child(int argc, char **argv, - orte_vpid_t vpid, int proc_vpid_index) +static void ssh_child(int argc, char **argv) { char** env; char* var; long fd, fdmax = sysconf(_SC_OPEN_MAX); - int rc; char *exec_path; char **exec_argv; int fdin; sigset_t sigs; - + /* setup environment */ env = opal_argv_copy(orte_launch_environ); @@ -643,16 +678,6 @@ static void ssh_child(int argc, char **argv, exec_argv = argv; exec_path = strdup(rsh_agent_path); - /* pass the vpid */ - rc = 
orte_util_convert_vpid_to_string(&var, vpid); - if (ORTE_SUCCESS != rc) { - opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string"); - exit(-1); - } - free(argv[proc_vpid_index]); - argv[proc_vpid_index] = strdup(var); - free(var); - /* Don't let ssh slurp all of our stdin! */ fdin = open("/dev/null", O_RDWR); dup2(fdin, 0); @@ -708,14 +733,16 @@ static int remote_spawn(opal_buffer_t *launch) int node_name_index1; int proc_vpid_index; char **argv = NULL; - char *prefix, *hostname; + char *prefix, *hostname, *var; int argc; int rc; bool failed_launch = true; - pid_t pid; orte_std_cntr_t n; opal_byte_object_t *bo; orte_process_name_t target; + orte_plm_rsh_caddy_t *caddy; + orte_job_t *daemons; + orte_grpcomm_collective_t coll; OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: remote spawn called", @@ -740,28 +767,21 @@ static int remote_spawn(opal_buffer_t *launch) goto cleanup; } - /* ensure the routing tree is updated */ - if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* clear out any previous child info */ - while (NULL != (item = opal_list_remove_first(&my_children))) { - OBJ_RELEASE(item); - } - - /* get the updated routing tree */ - my_parent = orte_routed.get_routing_tree(&my_children); - num_children = opal_list_get_size(&my_children); - + /* ensure the routing plan is updated */ + orte_routed.update_routing_plan(); + + /* get the updated routing list */ + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll); + /* if I have no children, just return */ - if (0 == num_children) { + if (0 == opal_list_get_size(&coll.targets)) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: remote spawn - have no children!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); failed_launch = false; rc = ORTE_SUCCESS; + OBJ_DESTRUCT(&coll); goto cleanup; } @@ -769,64 +789,66 @@ static int remote_spawn(opal_buffer_t 
*launch) if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, orte_process_info.nodename, &node_name_index1, &proc_vpid_index, prefix))) { ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&coll); + goto cleanup; + } + + /* get the daemon job object */ + if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + rc = ORTE_ERR_NOT_FOUND; + OBJ_DESTRUCT(&coll); goto cleanup; } target.jobid = ORTE_PROC_MY_NAME->jobid; - for (item = opal_list_get_first(&my_children); - item != opal_list_get_end(&my_children); + for (item = opal_list_get_first(&coll.targets); + item != opal_list_get_end(&coll.targets); item = opal_list_get_next(item)) { - orte_routed_tree_t *child = (orte_routed_tree_t*)item; - target.vpid = child->vpid; + orte_namelist_t *child = (orte_namelist_t*)item; + target.vpid = child->name.vpid; /* get the host where this daemon resides */ if (NULL == (hostname = orte_ess.proc_get_hostname(&target))) { opal_output(0, "%s unable to get hostname for daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->vpid)); + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->name.vpid)); rc = ORTE_ERR_NOT_FOUND; + OBJ_DESTRUCT(&coll); goto cleanup; } free(argv[node_name_index1]); argv[node_name_index1] = strdup(hostname); - /* fork a child to exec the rsh/ssh session */ - pid = fork(); - if (pid < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - rc = ORTE_ERR_SYS_LIMITS_CHILDREN; - goto cleanup; + /* pass the vpid */ + rc = orte_util_convert_vpid_to_string(&var, target.vpid); + if (ORTE_SUCCESS != rc) { + opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string"); + exit(-1); } + free(argv[proc_vpid_index]); + argv[proc_vpid_index] = strdup(var); + free(var); - /* child */ - if (pid == 0) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: launching on node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - hostname)); - - /* do the ssh launch - this will exit if it fails */ - 
ssh_child(argc, argv, child->vpid, proc_vpid_index); - - } - /* father */ - OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock); - /* This situation can lead to a deadlock if '--debug-daemons' is set. - * However, the deadlock condition is tested at the begining of this - * function, so we're quite confident it should not happens here. + /* we are in an event, so no need to protect the list */ + caddy = OBJ_NEW(orte_plm_rsh_caddy_t); + caddy->argc = argc; + caddy->argv = opal_argv_copy(argv); + /* fake a proc structure for the new daemon - will be released + * upon startup */ - if (num_active++ >= mca_plm_rsh_component.num_concurrent) { - opal_condition_wait(&mca_plm_rsh_component.cond, &mca_plm_rsh_component.lock); - } - OPAL_THREAD_UNLOCK(&mca_plm_rsh_component.lock); - - /* setup callback on sigchild - wait until setup above is complete - * as the callback can occur in the call to orte_wait_cb - */ - orte_wait_cb(pid, rsh_wait_daemon, (void*)&child->vpid); + caddy->daemon = OBJ_NEW(orte_proc_t); + caddy->daemon->name.jobid = ORTE_PROC_MY_NAME->jobid; + caddy->daemon->name.vpid = target.vpid; + opal_list_append(&launch_list, &caddy->super); } - - failed_launch = false; + OBJ_DESTRUCT(&coll); + + /* trigger the event to start processing the launch list */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:rsh: activating launch event", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + opal_event_active(&launch_event, EV_WRITE, 1); cleanup: if (NULL != argv) { @@ -836,108 +858,131 @@ cleanup: /* check for failed launch */ if (failed_launch) { /* report cannot launch this daemon to HNP */ - opal_buffer_t buf; + opal_buffer_t *buf; orte_std_cntr_t cnt=1; uint8_t flag=1; - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &cnt, 1, ORTE_STD_CNTR); - opal_dss.pack(&buf, &flag, 1, OPAL_UINT8); - opal_dss.pack(&buf, &target.vpid, 1, ORTE_VPID); - orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0); - OBJ_DESTRUCT(&buf); + buf = 
OBJ_NEW(opal_buffer_t); + opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR); + opal_dss.pack(buf, &flag, 1, OPAL_UINT8); + opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID); + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, + ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0, + orte_rml_send_callback, NULL); } return rc; } -/** +/* * Launch a daemon (bootproxy) on each node. The daemon will be responsible * for launching the application. */ -/* When working in this function, ALWAYS jump to "cleanup" if - * you encounter an error so that orterun will be woken up and - * the job can cleanly terminate - */ static int rsh_launch(orte_job_t *jdata) +{ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + /* this is a restart situation - skip to the mapping stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } else { + /* new job - set it up */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT); + } + return ORTE_SUCCESS; +} + +static void process_launch_list(int fd, short args, void *cbdata) +{ + opal_list_item_t *item; + pid_t pid; + orte_plm_rsh_caddy_t *caddy; + + while (num_in_progress < mca_plm_rsh_component.num_concurrent) { + item = opal_list_remove_first(&launch_list); + if (NULL == item) { + /* we are done */ + break; + } + caddy = (orte_plm_rsh_caddy_t*)item; + + /* fork a child to exec the rsh/ssh session */ + pid = fork(); + if (pid < 0) { + ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); + continue; + } + + /* child */ + if (pid == 0) { + /* do the ssh launch - this will exit if it fails */ + ssh_child(caddy->argc, caddy->argv); + } else { /* father */ + /* indicate this daemon has been launched */ + caddy->daemon->state = ORTE_PROC_STATE_RUNNING; + /* record the pid */ + caddy->daemon->pid = pid; + + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:rsh: recording launch of daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(caddy->daemon->name)))); + + /* setup callback on sigchild - wait until setup above is complete + * as the callback 
can occur in the call to orte_wait_cb + */ + orte_wait_cb(pid, rsh_wait_daemon, (void*)caddy); + num_in_progress++; + } + } +} + +static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map = NULL; int node_name_index1; int proc_vpid_index; char **argv = NULL; - char *prefix_dir; + char *prefix_dir=NULL, *var; int argc; int rc; - bool failed_launch = true; orte_app_context_t *app; orte_node_t *node, *nd; orte_std_cntr_t nnode; - orte_jobid_t failed_job; - orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED; opal_list_item_t *item; orte_job_t *daemons; + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + orte_plm_rsh_caddy_t *caddy; + orte_grpcomm_collective_t coll; - /* wait for the launch to complete */ - OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock); - while (orte_plm_globals.spawn_in_progress) { - opal_condition_wait(&orte_plm_globals.spawn_in_progress_cond, &orte_plm_globals.spawn_lock); - } - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "released to spawn")); - orte_plm_globals.spawn_in_progress = true; - orte_plm_globals.spawn_status = ORTE_ERR_FATAL; - OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock); - /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ - if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) { - failed_job = jdata->jobid; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - ORTE_ERROR_LOG(rc); - failed_launch = true; - goto cleanup; - } - /* wait for the launch to complete */ - OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock); - while (!orte_plm_globals.spawn_complete) { - opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock); - } - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid))); - orte_plm_globals.spawn_in_progress = false; - opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond); - 
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock); - failed_launch = false; - goto cleanup; - } - - /* default to declaring the daemon launch as having failed */ - failed_job = ORTE_PROC_MY_NAME->jobid; - - /* if we are timing, record the start time */ - if (orte_timing) { - gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL); - joblaunchstart = orte_plm_globals.daemonlaunchstart; + if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } /* setup the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { + if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } - + /* if we don't want to launch, then don't attempt to * launch the daemons - the user really wants to just * look at the proposed process map */ if (orte_do_not_launch) { - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: launching vm", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Get the map for this job */ if (NULL == (map = daemons->map)) { @@ -947,16 +992,23 @@ static int rsh_launch(orte_job_t *jdata) } if (0 == map->num_new_daemons) { - /* have all the daemons we need - launch app */ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: no new daemons to launch", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the 
following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:rsh: launching vm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if ((0 < opal_output_get_verbosity(orte_plm_globals.output) || orte_leave_session_attached) && - mca_plm_rsh_component.num_concurrent < (size_t)map->num_new_daemons) { + mca_plm_rsh_component.num_concurrent < map->num_new_daemons) { /** * If we are in '--debug-daemons' we keep the ssh connection * alive for the span of the run. If we use this option @@ -972,8 +1024,9 @@ static int rsh_launch(orte_job_t *jdata) */ orte_show_help("help-plm-rsh.txt", "deadlock-params", true, mca_plm_rsh_component.num_concurrent, map->num_new_daemons); - rc = ORTE_ERR_FATAL; - goto cleanup; + ORTE_ERROR_LOG(ORTE_ERR_FATAL); + OBJ_RELEASE(state); + return; } /* @@ -993,7 +1046,8 @@ static int rsh_launch(orte_job_t *jdata) * Since there always MUST be at least one app_context, we are safe in * doing this. 
*/ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); + app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, 0); + prefix_dir = app->prefix_dir; /* we also need at least one node name so we can check what shell is * being used, if we have to */ @@ -1010,12 +1064,6 @@ static int rsh_launch(orte_job_t *jdata) } } } - if (NULL == node) { - /* well, if there isn't even one node in the map, then we are hammered */ - rc = ORTE_ERR_FATAL; - goto cleanup; - } - prefix_dir = app->prefix_dir; /* setup the launch */ if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1, @@ -1023,12 +1071,14 @@ static int rsh_launch(orte_job_t *jdata) ORTE_ERROR_LOG(rc); goto cleanup; } - + + /* if we are tree launching, find our children and create the launch cmd */ if (mca_plm_rsh_component.tree_spawn) { orte_daemon_cmd_flag_t command = ORTE_DAEMON_TREE_SPAWN; opal_byte_object_t bo, *boptr; - + orte_job_t *jdatorted; + orte_tree_launch_cmd= OBJ_NEW(opal_buffer_t); /* insert the tree_spawn cmd */ if (ORTE_SUCCESS != (rc = opal_dss.pack(orte_tree_launch_cmd, &command, 1, ORTE_DAEMON_CMD))) { @@ -1058,43 +1108,44 @@ static int rsh_launch(orte_job_t *jdata) } /* release the data since it has now been copied into our buffer */ free(bo.bytes); - /* clear out any previous child info */ - while (NULL != (item = opal_list_remove_first(&my_children))) { - OBJ_RELEASE(item); + /* get the orted job data object */ + if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; } - - /* get the updated routing tree */ - my_parent = orte_routed.get_routing_tree(&my_children); - num_children = opal_list_get_size(&my_children); + + /* get the updated routing list */ + OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); + orte_routed.get_routing_list(ORTE_GRPCOMM_XCAST, &coll); } - /* set the job state to indicate we attempted to launch */ - 
job_state = ORTE_JOB_STATE_FAILED_TO_START; - /* * Iterate through each of the nodes */ for (nnode=0; nnode < map->nodes->size; nnode++) { - pid_t pid; - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) { continue; } /* if we are tree launching, only launch our own children */ if (mca_plm_rsh_component.tree_spawn) { - for (item = opal_list_get_first(&my_children); - item != opal_list_get_end(&my_children); + for (item = opal_list_get_first(&coll.targets); + item != opal_list_get_end(&coll.targets); item = opal_list_get_next(item)) { - orte_routed_tree_t *child = (orte_routed_tree_t*)item; - if (child->vpid == node->daemon->name.vpid) { + orte_namelist_t *child = (orte_namelist_t*)item; + if (child->name.vpid == node->daemon->name.vpid) { goto launch; } } /* didn't find it - ignore this node */ + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:rsh:launch daemon %s not a child of mine", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_VPID_PRINT(node->daemon->name.vpid))); continue; } - + launch: /* if this daemon already exists, don't launch it! 
*/ if (node->daemon_launched) { @@ -1114,8 +1165,7 @@ static int rsh_launch(orte_job_t *jdata) "%s plm:rsh:launch daemon failed to be defined on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); - rc = ORTE_ERR_FATAL; - goto cleanup; + continue; } /* setup node name */ @@ -1127,140 +1177,49 @@ static int rsh_launch(orte_job_t *jdata) } else { argv[node_name_index1] = strdup(node->name); } - + + /* pass the vpid */ + rc = orte_util_convert_vpid_to_string(&var, node->daemon->name.vpid); + if (ORTE_SUCCESS != rc) { + opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string"); + exit(-1); + } + free(argv[proc_vpid_index]); + argv[proc_vpid_index] = strdup(var); + free(var); + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: launching on node %s", + "%s plm:rsh: adding node %s to launch list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); - - /* fork a child to exec the rsh/ssh session */ - pid = fork(); - if (pid < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); - rc = ORTE_ERR_SYS_LIMITS_CHILDREN; - goto cleanup; - } - - /* child */ - if (pid == 0) { - - /* do the ssh launch - this will exit if it fails */ - ssh_child(argc, argv, node->daemon->name.vpid, proc_vpid_index); - - - } else { /* father */ - /* indicate this daemon has been launched */ - node->daemon->state = ORTE_PROC_STATE_LAUNCHED; - /* record the pid */ - node->daemon->pid = pid; - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: recording launch of daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&node->daemon->name))); - - /* setup callback on sigchild - wait until setup above is complete - * as the callback can occur in the call to orte_wait_cb - */ - orte_wait_cb(pid, rsh_wait_daemon, (void*)node->daemon); - - OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock); - /* This situation can lead to a deadlock if '--debug-daemons' is set. 
- * However, the deadlock condition is tested at the begining of this - * function, so we're quite confident it should not happens here. - */ - if (num_active++ >= mca_plm_rsh_component.num_concurrent) { - opal_condition_wait(&mca_plm_rsh_component.cond, &mca_plm_rsh_component.lock); - } - OPAL_THREAD_UNLOCK(&mca_plm_rsh_component.lock); - - /* if required - add delay to avoid problems w/ X11 authentication */ - if (0 < mca_plm_rsh_component.delay.tv_sec || - 0 < mca_plm_rsh_component.delay.tv_nsec) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: adding delay of %ds:%dusec to launch cycle", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)mca_plm_rsh_component.delay.tv_sec, - (int)mca_plm_rsh_component.delay.tv_nsec/1000)); - nanosleep(&mca_plm_rsh_component.delay, NULL); - } - } - } - - /* wait for daemons to callback - even those launched via tree will - * callback to mpirun, so all we need do is track them here - */ - if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm: daemon launch failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - launch_apps: - /* if we get here, then the daemons succeeded, so any failure would now be - * for the application job - */ - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - failed_job = jdata->jobid; - goto cleanup; - } - failed_job = jdata->jobid; - - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:rsh: launch of apps failed for job %s on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - /* wait for the launch to complete */ - OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock); - while (!orte_plm_globals.spawn_complete) { - 
opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock); + + /* we are in an event, so no need to protect the list */ + caddy = OBJ_NEW(orte_plm_rsh_caddy_t); + caddy->argc = argc; + caddy->argv = opal_argv_copy(argv); + caddy->daemon = node->daemon; + OBJ_RETAIN(caddy->daemon); + opal_list_append(&launch_list, &caddy->super); } + + /* set the job state to indicate the daemons are launched */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + + /* trigger the event to start processing the launch list */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid))); - orte_plm_globals.spawn_in_progress = false; - opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond); - OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock); + "%s plm:rsh: activating launch event", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + opal_event_active(&launch_event, EV_WRITE, 1); - /* get here if launch went okay */ - failed_launch = false; + /* now that we've launched the daemons, let the daemon callback + * function determine they are all alive and trigger the next stage + */ + OBJ_RELEASE(state); + return; - if (orte_timing ) { - if (0 != gettimeofday(&joblaunchstop, NULL)) { - opal_output(0, "plm_rsh: could not obtain job launch stop time"); - } else { - opal_output(0, "plm_rsh: total job launch time is %ld usec", - (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 + - (joblaunchstop.tv_usec - joblaunchstart.tv_usec)); - } - } - - cleanup: - if (NULL != argv) { - opal_argv_free(argv); - } - - /* check for failed launch - if so, force terminate */ - if (failed_launch) { - if (ORTE_ERR_SILENT == rc) { - orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } else { - orte_errmgr.update_state(failed_job, job_state, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } - } - - return rc; +cleanup: + 
OBJ_RELEASE(state); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } /** @@ -1294,20 +1253,16 @@ static int rsh_terminate_orteds(void) static int rsh_finalize(void) { int rc; - opal_list_item_t *item; - + + /* remove launch event */ + opal_event_del(&launch_event); + OBJ_DESTRUCT(&launch_list); + /* cleanup any pending recvs */ if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) { ORTE_ERROR_LOG(rc); } - - /* cleanup the children tree */ - while (NULL != (item = opal_list_remove_first(&my_children))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&my_children); - num_children = 0; - + return rc; } @@ -1315,11 +1270,11 @@ static int rsh_finalize(void) static void set_handler_default(int sig) { struct sigaction act; - + act.sa_handler = SIG_DFL; act.sa_flags = 0; sigemptyset(&act.sa_mask); - + sigaction(sig, &act, (struct sigaction *)0); } @@ -1328,18 +1283,18 @@ static orte_plm_rsh_shell_t find_shell(char *shell) { int i = 0; char *sh_name = NULL; - + if( (NULL == shell) || (strlen(shell) == 1) ) { /* Malformed shell */ return ORTE_PLM_RSH_SHELL_UNKNOWN; } - + sh_name = rindex(shell, '/'); if( NULL == sh_name ) { /* Malformed shell */ return ORTE_PLM_RSH_SHELL_UNKNOWN; } - + /* skip the '/' */ ++sh_name; for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name) / @@ -1348,7 +1303,7 @@ static orte_plm_rsh_shell_t find_shell(char *shell) return (orte_plm_rsh_shell_t)i; } } - + /* We didn't find it */ return ORTE_PLM_RSH_SHELL_UNKNOWN; } @@ -1377,8 +1332,8 @@ static int launch_agent_setup(const char *agent, char *path) } /* see if we can find the agent in the path */ - rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path); - + rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path); + if (NULL == rsh_agent_path) { /* not an error - just report not found */ opal_argv_free(rsh_agent_argv); @@ -1421,12 +1376,12 @@ static int rsh_probe(char *nodename, int fd[2]; pid_t pid; char outbuf[4096]; - + OPAL_OUTPUT_VERBOSE((1, 
orte_plm_globals.output, "%s plm:rsh: going to check SHELL variable on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename)); - + *shell = ORTE_PLM_RSH_SHELL_UNKNOWN; if (pipe(fd)) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, @@ -1455,7 +1410,7 @@ static int rsh_probe(char *nodename, argc = opal_argv_count(rsh_agent_argv); opal_argv_append(&argc, &argv, nodename); opal_argv_append(&argc, &argv, "echo $SHELL"); - + execvp(argv[0], argv); exit(errno); } @@ -1466,12 +1421,12 @@ static int rsh_probe(char *nodename, errno)); return ORTE_ERR_IN_ERRNO; } - + { ssize_t ret = 1; char* ptr = outbuf; size_t outbufsize = sizeof(outbuf); - + do { ret = read (fd[0], ptr, outbufsize-1); if (ret < 0) { @@ -1492,7 +1447,7 @@ static int rsh_probe(char *nodename, *ptr = '\0'; } close(fd[0]); - + if( outbuf[0] != '\0' ) { char *sh_name = rindex(outbuf, '/'); if( NULL != sh_name ) { @@ -1507,13 +1462,13 @@ static int rsh_probe(char *nodename, } } } - + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:rsh: node %s has SHELL: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename, (ORTE_PLM_RSH_SHELL_UNKNOWN == *shell) ? "UNHANDLED" : (char*)orte_plm_rsh_shell_name[*shell])); - + return rc; } @@ -1525,7 +1480,7 @@ static int setup_shell(orte_plm_rsh_shell_t *rshell, struct passwd *p; char *param; int rc; - + /* What is our local shell? 
*/ local_shell = ORTE_PLM_RSH_SHELL_UNKNOWN; p = getpwuid(getuid()); @@ -1538,7 +1493,7 @@ static int setup_shell(orte_plm_rsh_shell_t *rshell, } param = p->pw_shell; local_shell = find_shell(p->pw_shell); - + /* If we didn't find it in getpwuid(), try looking at the $SHELL environment variable (see https://svn.open-mpi.org/trac/ompi/ticket/1060) */ diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index f15e168a44..ae87d150ae 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -68,6 +68,7 @@ #include "orte/runtime/orte_quit.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/state/state.h" #include "orte/orted/orted.h" @@ -110,6 +111,7 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = { static pid_t primary_srun_pid = 0; static bool primary_pid_set = false; static bool launching_daemons; +static void launch_daemons(int fd, short args, void *cbdata); /** * Init the module @@ -120,6 +122,7 @@ static int plm_slurm_init(void) if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) { ORTE_ERROR_LOG(rc); + return rc; } /* if we don't want to launch (e.g., someone just wants @@ -138,6 +141,13 @@ static int plm_slurm_init(void) orte_plm_globals.daemon_nodes_assigned_at_launch = false; } + /* point to our launch command */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, + launch_daemons, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return rc; } @@ -146,6 +156,18 @@ static int plm_slurm_init(void) * the job can cleanly terminate */ static int 
plm_slurm_launch_job(orte_job_t *jdata) +{ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + /* this is a restart situation - skip to the mapping stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } else { + /* new job - set it up */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT); + } + return ORTE_SUCCESS; +} + +static void launch_daemons(int fd, short args, void *cbdata) { orte_app_context_t *app; orte_node_t *node; @@ -164,68 +186,46 @@ static int plm_slurm_launch_job(orte_job_t *jdata) char **custom_strings; int num_args, i; char *cur_prefix; - struct timeval launchstart, launchstop; int proc_vpid_index; - orte_jobid_t failed_job; bool failed_launch=true; orte_job_t *daemons; + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:slurm: LAUNCH DAEMONS CALLED", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ - if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) { - failed_job = jdata->jobid; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - ORTE_ERROR_LOG(rc); - failed_launch = true; - } else { - failed_launch = false; - } - goto cleanup; - } - - /* if we are timing, record the start time */ - if (orte_timing) { - gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL); - } - - /* flag the daemons as failing by default */ - failed_job = ORTE_PROC_MY_NAME->jobid; - - if (orte_timing) { - if (0 != gettimeofday(&launchstart, NULL)) { - opal_output(0, "plm_slurm: could not obtain job start time"); - launchstart.tv_sec = 0; - launchstart.tv_usec = 0; - } - } - - /* if we don't want to launch, then don't attempt to - * launch the daemons - the user really wants to just - * look at the proposed process map - */ - if (orte_do_not_launch) { - goto launch_apps; + if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { + 
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } /* start by setting up the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { + if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* indicate the state of the launch */ - launching_daemons = true; - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: launching vm", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - - /* set the active jobid */ - failed_job = daemons->jobid; - + /* if we don't want to launch, then don't attempt to + * launch the daemons - the user really wants to just + * look at the proposed process map + */ + if (orte_do_not_launch) { + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; + } + /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -234,11 +234,17 @@ static int plm_slurm_launch_job(orte_job_t *jdata) } if (0 == map->num_new_daemons) { - /* no new daemons required - just launch apps */ + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:slurm: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto launch_apps; + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } /* need integer value for command line parameter */ @@ -352,9 +358,9 @@ static int 
plm_slurm_launch_job(orte_job_t *jdata) don't support different --prefix'es for different nodes in the SLURM plm) */ cur_prefix = NULL; - for (n=0; n < jdata->apps->size; n++) { + for (n=0; n < state->jdata->apps->size; n++) { char * app_prefix_dir; - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) { continue; } app_prefix_dir = app->prefix_dir; @@ -365,7 +371,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata) 0 != strcmp (cur_prefix, app_prefix_dir)) { orte_show_help("help-plm-slurm.txt", "multiple-prefixes", true, cur_prefix, app_prefix_dir); - return ORTE_ERR_FATAL; + goto cleanup; } /* If not yet set, copy it; iff set, then it's the @@ -399,56 +405,11 @@ static int plm_slurm_launch_job(orte_job_t *jdata) goto cleanup; } - /* do NOT wait for srun to complete. Srun only completes when the processes - * it starts - in this case, the orteds - complete. 
Instead, we'll catch - * any srun failures and deal with them elsewhere - */ - - /* wait for daemons to callback */ - if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: daemon launch failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - launch_apps: - /* get here if daemons launch okay - any failures now by apps */ - launching_daemons = false; - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - failed_job = jdata->jobid; - goto cleanup; - } - failed_job = jdata->jobid; + /* indicate that the daemons for this job were launched */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:slurm: launch of apps failed for job %s on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - /* declare the launch a success */ + /* flag that launch was successful, so far as we currently know */ failed_launch = false; - - if (orte_timing) { - if (0 != gettimeofday(&launchstop, NULL)) { - opal_output(0, "plm_slurm: could not obtain stop time"); - } else { - opal_output(0, "plm_slurm: total job launch time is %ld usec", - (launchstop.tv_sec - launchstart.tv_sec)*1000000 + - (launchstop.tv_usec - launchstart.tv_usec)); - } - } - - if (ORTE_SUCCESS != rc) { - opal_output(0, "plm:slurm: start_procs returned error %d", rc); - goto cleanup; - } cleanup: if (NULL != argv) { @@ -462,20 +423,13 @@ static int plm_slurm_launch_job(orte_job_t *jdata) free(jobid_string); } + /* cleanup the caddy */ + OBJ_RELEASE(state); + /* check for failed launch - if so, force terminate */ if (failed_launch) { - if (ORTE_ERR_SILENT == rc) { - orte_errmgr.update_state(failed_job, 
ORTE_JOB_STATE_SILENT_ABORT, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } else { - orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } - - return rc; } @@ -486,28 +440,27 @@ static int plm_slurm_terminate_orteds(void) { int rc; orte_job_t *jdata; - - /* tell them to die without sending a reply - we will rely on the - * waitpid to tell us when they have exited! - */ - if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) { - ORTE_ERROR_LOG(rc); - } - + /* check to see if the primary pid is set. If not, this indicates * that we never launched any additional daemons, so we cannot * not wait for a waitpid to fire and tell us it's okay to * exit. Instead, we simply trigger an exit for ourselves */ - if (!primary_pid_set) { + if (primary_pid_set) { + /* tell them to die without sending a reply - we will rely on the + * waitpid to tell us when they have exited! 
+ */ + if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) { + ORTE_ERROR_LOG(rc); + } + } else { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:slurm: primary daemons complete!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - jdata->state = ORTE_JOB_STATE_TERMINATED; /* need to set the #terminated value to avoid an incorrect error msg */ jdata->num_terminated = jdata->num_procs; - orte_quit(); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); } return rc; @@ -566,14 +519,15 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){ pid so nobody thinks this is real */ + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + /* if we are in the launch phase, then any termination is bad */ if (launching_daemons) { /* report that one or more daemons failed to launch so we can exit */ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:slurm: daemon failed during launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, 0, status); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); } else { /* if this is after launch, then we need to abort only if the status * returned is non-zero - i.e., if the orteds exited with an error @@ -585,8 +539,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:slurm: daemon failed while running", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED, - NULL, ORTE_PROC_STATE_UNDEF, 0, status); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ABORTED); } /* otherwise, check to see if this is the primary pid */ if (primary_srun_pid == pid) { @@ -596,11 +549,9 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){ OPAL_OUTPUT_VERBOSE((1, 
orte_plm_globals.output, "%s plm:slurm: primary daemons complete!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - jdata->state = ORTE_JOB_STATE_TERMINATED; /* need to set the #terminated value to avoid an incorrect error msg */ jdata->num_terminated = jdata->num_procs; - orte_quit(); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); } } } diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index 3f0996013a..0f755c3776 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -70,6 +70,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/state/state.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/plm_private.h" @@ -78,7 +79,7 @@ /* - * Local functions + * API functions */ static int plm_tm_init(void); static int plm_tm_launch_job(orte_job_t *jdata); @@ -86,14 +87,11 @@ static int plm_tm_terminate_orteds(void); static int plm_tm_signal_job(orte_jobid_t jobid, int32_t signal); static int plm_tm_finalize(void); -static int plm_tm_connect(void); -static int plm_tm_disconnect(void); -static void failed_start(int fd, short event, void *arg); - /* * Local "global" variables */ -static opal_event_t *ev=NULL; +static orte_std_cntr_t launched = 0; +static bool connected = false; /* * Global variable @@ -110,6 +108,14 @@ orte_plm_base_module_t orte_plm_tm_module = { plm_tm_finalize }; +/* Local functions */ +static int plm_tm_connect(void); +static int plm_tm_disconnect(void); +static void failed_start(int fd, short 
event, void *arg); +static void launch_daemons(int fd, short args, void *cbdata); +static void poll_spawns(int fd, short args, void *cbdata); + + /** * Init the module */ @@ -124,15 +130,43 @@ static int plm_tm_init(void) /* we assign daemon nodes at launch */ orte_plm_globals.daemon_nodes_assigned_at_launch = true; + /* point to our launch command */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, + launch_daemons, ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* overwrite the daemons_launched state to point to + * our own local function + */ + if (ORTE_SUCCESS != (rc = orte_state.set_job_state_callback(ORTE_JOB_STATE_DAEMONS_LAUNCHED, + poll_spawns))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return rc; } +static int plm_tm_launch_job(orte_job_t *jdata) +{ + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { + /* this is a restart situation - skip to the mapping stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); + } else { + /* new job - set it up */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT); + } + return ORTE_SUCCESS; +} + /* When working in this function, ALWAYS jump to "cleanup" if * you encounter an error so that orterun will be woken up and * the job can cleanly terminate */ -static int plm_tm_launch_job(orte_job_t *jdata) +static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map = NULL; orte_app_context_t *app; @@ -145,43 +179,29 @@ static int plm_tm_launch_job(orte_job_t *jdata) char **nodeargv; int argc = 0; int rc; - bool connected = false; - orte_std_cntr_t launched = 0, i; + orte_std_cntr_t i; char *bin_base = NULL, *lib_base = NULL; tm_event_t *tm_events = NULL; tm_task_id *tm_task_ids = NULL; - int local_err; - tm_event_t event; bool failed_launch = true; mode_t current_umask; - orte_jobid_t failed_job; char *nodelist; char* vpid_string; - orte_job_t *daemons; + orte_job_t *daemons, *jdata; + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + 
+ jdata = state->jdata; /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) { - failed_job = jdata->jobid; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - ORTE_ERROR_LOG(rc); - failed_launch = true; - } else { - failed_launch = false; - } - goto cleanup; + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } - /* if we are timing, record the start time */ - if (orte_timing) { - gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL); - } - - /* default to declaring the daemons as failed */ - failed_job = ORTE_PROC_MY_NAME->jobid; - - /* start by launching the virtual machine */ + /* setup the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { ORTE_ERROR_LOG(rc); @@ -193,12 +213,15 @@ static int plm_tm_launch_job(orte_job_t *jdata) * look at the proposed process map */ if (orte_do_not_launch) { - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm: launching vm", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Get the map for this job */ if (NULL == (map = daemons->map)) { @@ -208,10 +231,20 @@ static int plm_tm_launch_job(orte_job_t *jdata) } if (0 == map->num_new_daemons) { - /* have all the daemons we need - launch app */ - goto launch_apps; + /* set the state to indicate the daemons reported - this + * will trigger the daemons_reported event and cause the + * job to move to the following step + */ + jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + 
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_DAEMONS_REPORTED); + OBJ_RELEASE(state); + return; } + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "%s plm:tm: launching vm", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* Allocate a bunch of TM events to use for tm_spawn()ing */ tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons); if (NULL == tm_events) { @@ -372,30 +405,51 @@ static int plm_tm_launch_job(orte_job_t *jdata) } launched++; - - /* Allow some progress to occur */ - opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK); } + /* indicate that the daemons for this job were launched */ + state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; + + /* flag that launch was successful, so far as we currently know */ + failed_launch = false; + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:tm:launch: finished spawning orteds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + cleanup: + /* cleanup */ + OBJ_RELEASE(state); + + /* check for failed launch - if so, force terminate */ + if (failed_launch) { + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + } +} + +static void poll_spawns(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + int i, rc; + bool failed_launch = true; + int local_err; + tm_event_t event; + /* TM poll for all the spawns */ for (i = 0; i < launched; ++i) { rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err); if (TM_SUCCESS != rc) { - errno = local_err; opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc); goto cleanup; } if (TM_SUCCESS != local_err) { - errno = local_err; - opal_output(0, "plm:tm: failed to spawn daemon, error code = %d", errno ); + opal_output(0, "plm:tm: failed to spawn daemon, error code = %d", local_err ); goto cleanup; } } - + failed_launch = false; + +#if 0 /* set a timer to tell us if one or more daemon's fails to start - use the * millisec/daemon timeout provided by the user to compute time */ @@ -404,91 +458,20 @@ static int 
plm_tm_launch_job(orte_job_t *jdata) "%s plm:tm: setting startup timer for %d milliseconds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_startup_timeout)); - ORTE_DETECT_TIMEOUT(&ev, map->num_new_daemons, + ORTE_DETECT_TIMEOUT(map->num_new_daemons, orte_startup_timeout*1000, - -1, failed_start); + -1, failed_start, state->jdata); } - - /* wait for daemons to callback */ - if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm: daemon launch failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - /* if issued, cancel the failed-to-start timer */ - if (NULL != ev) { - opal_event_del(ev); - } - -launch_apps: - /* setup the job */ - if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) { - ORTE_ERROR_LOG(rc); - failed_job = jdata->jobid; - goto cleanup; - } - /* since the daemons have launched, any failures now will be for the - * application job - */ - failed_job = jdata->jobid; - if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm: launch of apps failed for job %s on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); - goto cleanup; - } - - /* if we get here, then everything launched okay - record that fact */ - failed_launch = false; - +#endif cleanup: - if (NULL != argv) { - opal_argv_free(argv); - } - if (NULL != env) { - opal_argv_free(env); - } - - if (connected) { - plm_tm_disconnect(); - } - if (NULL != tm_events) { - free(tm_events); - } - if (NULL != tm_task_ids) { - free(tm_task_ids); - } - - if (NULL != lib_base) { - free(lib_base); - } - if (NULL != bin_base) { - free(bin_base); - } + /* cleanup */ + OBJ_RELEASE(state); /* check for failed launch - if so, force terminate */ if (failed_launch) { - if (ORTE_ERR_SILENT == rc) { - orte_errmgr.update_state(failed_job, 
ORTE_JOB_STATE_SILENT_ABORT, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } else { - orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); - } + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } - - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm:launch: finished", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return rc; } @@ -545,6 +528,11 @@ static int plm_tm_finalize(void) ORTE_ERROR_LOG(rc); } + if (connected) { + tm_finalize(); + connected = false; + } + return ORTE_SUCCESS; } @@ -573,33 +561,3 @@ static int plm_tm_connect(void) return ORTE_ERR_RESOURCE_BUSY; } - - -static int plm_tm_disconnect(void) -{ - tm_finalize(); - - return ORTE_SUCCESS; -} - -/* call this function if the timer fires indicating that one - * or more daemons failed to start - */ -static void failed_start(int fd, short dummy, void *arg) -{ - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm:failed_start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if we are aborting, ignore this */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, - "%s plm:tm:failed_start - abnormal term in progress", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return; - } - - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERROR_DEFAULT_EXIT_CODE); -} diff --git a/orte/mca/ras/base/base.h b/orte/mca/ras/base/base.h index 4eb07f59e1..801504e725 100644 --- a/orte/mca/ras/base/base.h +++ b/orte/mca/ras/base/base.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -45,11 +47,11 @@ typedef struct orte_ras_base_t { bool allocation_read; bool display_alloc; orte_ras_base_module_t *active_module; + int total_slots_alloc; } orte_ras_base_t; ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base; - /* * function definitions */ @@ -57,6 +59,8 @@ ORTE_DECLSPEC int orte_ras_base_select(void); ORTE_DECLSPEC int orte_ras_base_finalize(void); ORTE_DECLSPEC int orte_ras_base_close(void); +ORTE_DECLSPEC void orte_ras_base_allocate(int fd, short args, void *cbdata); + ORTE_DECLSPEC int orte_ras_base_add_hosts(orte_job_t *jdata); #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/mca/ras/base/ras_base_allocate.c b/orte/mca/ras/base/ras_base_allocate.c index 99316bffbe..1f6972e5a0 100644 --- a/orte/mca/ras/base/ras_base_allocate.c +++ b/orte/mca/ras/base/ras_base_allocate.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +43,7 @@ #include "orte/util/dash_host/dash_host.h" #include "orte/util/proc_info.h" #include "orte/util/comm/comm.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_quit.h" #include "orte/mca/ras/base/ras_private.h" @@ -85,18 +88,23 @@ static void display_alloc(void) * Function for selecting one component from all those that are * available. 
*/ -int orte_ras_base_allocate(orte_job_t *jdata) +void orte_ras_base_allocate(int fd, short args, void *cbdata) { int rc; + orte_job_t *jdata; opal_list_t nodes; orte_node_t *node; orte_std_cntr_t i; orte_app_context_t *app; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output, "%s ras:base:allocate", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* convenience */ + jdata = caddy->jdata; + /* if we already did this, don't do it again - the pool of * global resources is set. */ @@ -105,16 +113,10 @@ int orte_ras_base_allocate(orte_job_t *jdata) OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output, "%s ras:base:allocate allocation already read", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* loop through the global node pool and set the - * number of allocated slots to the difference - * between slots and slots_in_use. Note that - * oversubscription will still allow procs to - * be mapped up to slots_max - */ - return ORTE_SUCCESS; + goto next_state; } - + orte_ras_base.allocation_read = true; + /* Otherwise, we have to create * the initial set of resources that will delineate all * further operations serviced by this HNP. This list will @@ -124,11 +126,6 @@ int orte_ras_base_allocate(orte_job_t *jdata) * no job launched by this HNP will be able to utilize it. */ - /* note that the allocation has been read so we don't - * come in here again! 
- */ - orte_ras_base.allocation_read = true; - /* construct a list to hold the results */ OBJ_CONSTRUCT(&nodes, opal_list_t); @@ -147,18 +144,22 @@ int orte_ras_base_allocate(orte_job_t *jdata) } ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } } /* If something came back, save it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the - * list items - */ + * list items + */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } OBJ_DESTRUCT(&nodes); /* default to no-oversubscribe-allowed for managed systems */ @@ -172,9 +173,9 @@ int orte_ras_base_allocate(orte_job_t *jdata) */ OBJ_DESTRUCT(&nodes); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_jobs_complete(); - return ORTE_ERROR; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } @@ -203,7 +204,9 @@ int orte_ras_base_allocate(orte_job_t *jdata) orte_default_hostfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } } /* if something was found in the default hostfile, we use that as our global @@ -215,6 +218,9 @@ int orte_ras_base_allocate(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* cleanup */ OBJ_DESTRUCT(&nodes); @@ -246,10 +252,13 @@ int orte_ras_base_allocate(orte_job_t *jdata) /* hostfile was specified - parse it and add it to the list */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, - app->hostfile))) { + app->hostfile))) { 
ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + /* set an error event */ + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } } } @@ -263,14 +272,15 @@ int orte_ras_base_allocate(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* cleanup */ OBJ_DESTRUCT(&nodes); goto DISPLAY; } - - OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output, "%s ras:base:allocate nothing found in hostfiles - checking dash-host options", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -297,10 +307,12 @@ int orte_ras_base_allocate(orte_job_t *jdata) } if (NULL != app->dash_host) { if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, - app->dash_host))) { + app->dash_host))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } } } @@ -314,12 +326,15 @@ int orte_ras_base_allocate(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* cleanup */ OBJ_DESTRUCT(&nodes); goto DISPLAY; } - + OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output, "%s ras:base:allocate nothing found in dash-host - checking for rankfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -333,7 +348,9 @@ int orte_ras_base_allocate(orte_job_t *jdata) orte_rankfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return ; } } /* if something was found in rankfile, we use that as our global @@ -345,6 +362,13 @@ int orte_ras_base_allocate(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; + } + /* rankfile is 
considered equivalent to an RM allocation */ + if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* cleanup */ OBJ_DESTRUCT(&nodes); @@ -356,7 +380,7 @@ int orte_ras_base_allocate(orte_job_t *jdata) "%s ras:base:allocate nothing found in rankfile - inserting current node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); -addlocal: + addlocal: /* if nothing was found by any of the above methods, then we have no * earthly idea what to do - so just add the local host */ @@ -364,7 +388,9 @@ addlocal: if (NULL == node) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); OBJ_DESTRUCT(&nodes); - return ORTE_ERR_OUT_OF_RESOURCE; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* use the same name we got in orte_process_info so we avoid confusion in * the session directories @@ -382,23 +408,36 @@ addlocal: if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } OBJ_DESTRUCT(&nodes); -DISPLAY: + DISPLAY: + /* shall we display the results? */ + if (0 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) { + display_alloc(); + } + + next_state: /* are we to report this event? */ if (orte_report_events) { if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) { ORTE_ERROR_LOG(rc); + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); } } - /* shall we display the results? 
*/ - if (orte_ras_base.display_alloc) { - display_alloc(); - } - return rc; + /* set total slots alloc */ + jdata->total_slots_alloc = orte_ras_base.total_slots_alloc; + + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + + /* cleanup */ + OBJ_RELEASE(caddy); } int orte_ras_base_add_hosts(orte_job_t *jdata) @@ -480,7 +519,7 @@ int orte_ras_base_add_hosts(orte_job_t *jdata) } /* shall we display the results? */ - if (orte_ras_base.display_alloc) { + if (0 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) { display_alloc(); } diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index 5afb4a5e19..7290ed2115 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/orte/mca/ras/base/ras_base_open.c b/orte/mca/ras/base/ras_base_open.c index 831928254a..af13b9e61a 100644 --- a/orte/mca/ras/base/ras_base_open.c +++ b/orte/mca/ras/base/ras_base_open.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,6 +27,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" +#include "opal/mca/event/event.h" #include "orte/mca/ras/base/ras_private.h" @@ -61,13 +64,8 @@ int orte_ras_base_open(void) /* * Global variables */ -orte_ras_t orte_ras = { - orte_ras_base_allocate -}; - orte_ras_base_t orte_ras_base; - /** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. @@ -80,7 +78,8 @@ int orte_ras_base_open(void) /* set default flags */ orte_ras_base.active_module = NULL; orte_ras_base.allocation_read = false; - + orte_ras_base.total_slots_alloc = 0; + /* should we display the allocation after determining it? */ mca_base_param_reg_int_name("ras", "base_display_alloc", "Whether to display the allocation after it is determined", diff --git a/orte/mca/ras/base/ras_private.h b/orte/mca/ras/base/ras_private.h index 8640e6f03a..1714c00997 100644 --- a/orte/mca/ras/base/ras_private.h +++ b/orte/mca/ras/base/ras_private.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -39,76 +41,11 @@ BEGIN_C_DECLS -/* - * API function definitions - */ -ORTE_DECLSPEC int orte_ras_base_allocate(orte_job_t *jdata); - /** * Add the specified node definitions to the registry */ ORTE_DECLSPEC int orte_ras_base_node_insert(opal_list_t*, orte_job_t*); -#if 0 -/* - * Internal support functions - */ -ORTE_DECLSPEC int orte_ras_base_allocate_nodes(orte_jobid_t jobid, - opal_list_t* nodes); - -ORTE_DECLSPEC int orte_ras_base_reallocate(orte_jobid_t parent_jobid, - orte_jobid_t child_jobid); - -ORTE_DECLSPEC int orte_ras_base_set_oversubscribe_override(orte_jobid_t job); - -ORTE_DECLSPEC int orte_ras_base_hostfile_query(char *hostfile); - -ORTE_DECLSPEC int orte_ras_base_get_oversubscribe_override(orte_jobid_t job, bool *flag); - -ORTE_DECLSPEC int orte_ras_base_read_nodename_file(opal_list_t *nodes, char *filename); - -/* - * Query the registry for all available nodes - */ -ORTE_DECLSPEC int orte_ras_base_node_query(opal_list_t*); - -/* - * Query the registry for a specific node - */ -ORTE_DECLSPEC orte_ras_node_t* orte_ras_base_node_lookup(const char* nodename); - -/** - * Query the registry for all nodes allocated to a specific job - */ -ORTE_DECLSPEC int orte_ras_base_node_query_alloc(opal_list_t*, orte_jobid_t); - -ORTE_DECLSPEC int orte_ras_base_proc_query_alloc(opal_list_t* procs); - -/** - * Add the specified node definitions to the registry - */ -ORTE_DECLSPEC int orte_ras_base_node_insert(opal_list_t*); - -ORTE_DECLSPEC int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid); - -/** - * Delete the specified nodes from the registry - */ -ORTE_DECLSPEC int orte_ras_base_node_delete(opal_list_t*); - -/** - * Assign the allocated slots on the specified nodes to the - * indicated jobid. 
- */ -ORTE_DECLSPEC int orte_ras_base_node_assign(opal_list_t*, orte_jobid_t); - -/** - * Check to see if the node segment is empty - */ -ORTE_DECLSPEC int orte_ras_base_node_segment_empty(bool *empty); - -#endif - END_C_DECLS #endif diff --git a/orte/mca/ras/ras.h b/orte/mca/ras/ras.h index bb9d3861eb..797a7a206d 100644 --- a/orte/mca/ras/ras.h +++ b/orte/mca/ras/ras.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -161,6 +163,7 @@ #include "orte/types.h" #include "opal/mca/mca.h" +#include "opal/mca/event/event.h" #include "opal/class/opal_list.h" #include "orte/runtime/orte_globals.h" @@ -169,16 +172,10 @@ BEGIN_C_DECLS -/* define the API functions */ -typedef int (*orte_ras_base_API_allocate_fn_t)(orte_job_t *jdata); - -/* global structure for accessing RAS API's */ -typedef struct { - orte_ras_base_API_allocate_fn_t allocate; -} orte_ras_t; - -ORTE_DECLSPEC extern orte_ras_t orte_ras; - +/* allocation event - the event one activates to schedule resource + * allocation for pending jobs + */ +ORTE_DECLSPEC extern opal_event_t orte_allocate_event; /* * ras module functions - these are not accessible to the outside world, diff --git a/orte/mca/rmaps/base/Makefile.am b/orte/mca/rmaps/base/Makefile.am index ae0dba67f5..775d4c1e22 100644 --- a/orte/mca/rmaps/base/Makefile.am +++ b/orte/mca/rmaps/base/Makefile.am @@ -10,8 +10,8 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011 Los Alamos National Security, LLC. All rights -# reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index cf377c7237..c76b8f73f2 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -90,6 +92,11 @@ OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t); ORTE_DECLSPEC int orte_rmaps_base_select(void); +/* + * Map a job + */ +ORTE_DECLSPEC void orte_rmaps_base_map_job(int fd, short args, void *cbdata); + /** * Utility routines to get/set vpid mapping for the job */ diff --git a/orte/mca/rmaps/base/rmaps_base_close.c b/orte/mca/rmaps/base/rmaps_base_close.c index 56ebb73043..26ffd4a013 100644 --- a/orte/mca/rmaps/base/rmaps_base_close.c +++ b/orte/mca/rmaps/base/rmaps_base_close.c @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/orte/mca/rmaps/base/rmaps_base_get_job_map.c b/orte/mca/rmaps/base/rmaps_base_get_job_map.c deleted file mode 100644 index e84340d890..0000000000 --- a/orte/mca/rmaps/base/rmaps_base_get_job_map.c +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" - - -/* - * Function to return a pointer to a job map - */ -orte_job_map_t* orte_rmaps_base_get_job_map(orte_jobid_t job) -{ - orte_job_t *jdata; - orte_job_map_t *map; - orte_job_t *daemons; - orte_proc_t *proc; - orte_std_cntr_t i; - orte_node_t *node; - - /* lookup the job's data object */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - /* bad jobid */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return NULL; - } - - /* locate the map */ - map = jdata->map; - - /* lookup the daemon's job data struct */ - if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - /* bad jobid */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return NULL; - } - - /* check to see if daemons have been launched since we - * last updated the map - */ - for (i=0; i < map->nodes->size; i++) { - if (NULL != map->nodes->addr[i]) { - node = (orte_node_t*)map->nodes->addr[i]; - if (NULL != node->daemon) { - if (daemons->procs->size < (orte_std_cntr_t)node->daemon->name.vpid) { - /* well that is bad */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return NULL; - } - /* find the daemon's info */ - proc = (orte_proc_t*)daemons->procs->addr[node->daemon->name.vpid]; - if (NULL == proc) { - /* well that is bad too */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return NULL; - } - if (NULL != proc->rml_uri) { - node->daemon_launched = true; - } else { - node->daemon_launched = false; - } - } - } - } - - return map; -} diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index f6ba00ed9e..37eb8de0aa 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ 
b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -9,7 +9,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,6 +33,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/orte_globals.h" #include "orte/util/show_help.h" +#include "orte/mca/state/state.h" #include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/rmaps_private.h" @@ -40,13 +43,19 @@ * Function for selecting one component from all those that are * available. */ -int orte_rmaps_base_map_job(orte_job_t *jdata) +void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { + orte_job_t *jdata; orte_job_map_t *map; int rc; bool did_map; opal_list_item_t *item; orte_rmaps_base_selected_module_t *mod; + orte_job_t *parent; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* convenience */ + jdata = caddy->jdata; /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT * ALLOWED TO CALL RMAPS INDEPENDENTLY. 
ONLY THE PLM CAN @@ -64,21 +73,18 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) * NULL MAP FIELD * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY * DIDN'T SET IT - */ - + */ if (NULL == jdata->map) { - /* a map has not been defined yet for this job, so set one - * up here - */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps: creating new map for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - /* create a map object where we will store the results */ map = OBJ_NEW(orte_job_map_t); if (NULL == map) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* load it with the system defaults */ map->mapping = orte_rmaps_base.mapping; @@ -119,6 +125,28 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) #endif } +#if OPAL_HAVE_HWLOC + /* if we are not going to launch, then we need to set any + * undefined topologies to match our own so the mapper + * can operate + */ + if (orte_do_not_launch) { + orte_node_t *node; + hwloc_topology_t t0; + int i; + node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + t0 = node->topology; + for (i=1; i < orte_node_pool->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + if (NULL == node->topology) { + node->topology = t0; + } + } + } +#endif + /* cycle thru the available mappers until one agrees to map * the job */ @@ -136,7 +164,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } } /* if we get here without doing the map, or with zero procs in @@ -144,23 +174,36 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) */ if (!did_map || 0 == jdata->num_procs) { orte_show_help("help-orte-rmaps-base.txt", "failed-map", true); - return ORTE_ERR_FAILED_TO_MAP; + 
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } /* compute and save local ranks */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } #if OPAL_HAVE_HWLOC /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); - return rc; + ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + OBJ_RELEASE(caddy); + return; } #endif + /* if it is a dynamic spawn, save the bookmark on the parent's job too */ + if (ORTE_JOBID_INVALID != jdata->originator.jobid) { + if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) { + parent->bookmark = jdata->bookmark; + } + } + /* if we wanted to display the map, now is the time to do it - ignore * daemon job */ @@ -258,6 +301,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) free(output); } } - - return ORTE_SUCCESS; + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP); + + /* cleanup */ + OBJ_RELEASE(caddy); } diff --git a/orte/mca/rmaps/base/rmaps_base_open.c b/orte/mca/rmaps/base/rmaps_base_open.c index 21d9f663fe..39d323500c 100644 --- a/orte/mca/rmaps/base/rmaps_base_open.c +++ b/orte/mca/rmaps/base/rmaps_base_open.c @@ -10,8 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. All rights - * reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -68,14 +68,6 @@ int orte_rmaps_base_open(void) */ orte_rmaps_base_t orte_rmaps_base; -/* - * Declare the RMAPS module to hold the API function pointers - */ -orte_rmaps_t orte_rmaps = { - orte_rmaps_base_map_job, -}; - - /** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index a7007d9f92..0bbf741f1b 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -145,13 +145,7 @@ static int rank_span(orte_job_t *jdata, "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); proc->name.vpid = vpid++; cnt++; - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); - /* If there is an invalid epoch here, it's because it doesn't exist yet. */ - if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); - } if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; @@ -257,13 +251,7 @@ static int rank_fill(orte_job_t *jdata, "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); proc->name.vpid = vpid++; cnt++; - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); - /* If there is an invalid epoch here, it's because it doesn't exist yet. 
*/ - if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); - } if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; @@ -388,14 +376,8 @@ static int rank_by(orte_job_t *jdata, cnt++; opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); - - /* If there is an invalid epoch here, it's because it doesn't exist yet. */ - if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); - } - /* flag that one was mapped */ + + /* flag that one was mapped */ all_done = false; /* track where the highest vpid landed - this is our * new bookmark @@ -465,8 +447,6 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, continue; } proc->name.vpid = vpid++; - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); /* insert the proc into the jdata->procs array - can't already * be there as the only way to this point in the code is for the * vpid to have been INVALID @@ -515,14 +495,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, "mca:rmaps:base: assigning rank %s to node %s", ORTE_VPID_PRINT(vpid), node->name); proc->name.vpid = vpid++; - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); - - /* If there is an invalid epoch here, it's because it doesn't exist yet. 
*/ - if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) { - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); - } - /* track where the highest vpid landed - this is our + /* track where the highest vpid landed - this is our * new bookmark */ jdata->bookmark = node; diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 9cd3301123..8f690af968 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -357,11 +357,6 @@ orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata, proc = OBJ_NEW(orte_proc_t); /* set the jobid */ proc->name.jobid = jdata->jobid; - /* we do not set the vpid here - this will be done - * during a second phase, but we do set the epoch here - * since they all start with the same value. - */ - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); /* flag the proc as ready for launch */ proc->state = ORTE_PROC_STATE_INIT; proc->app_idx = idx; diff --git a/orte/mca/rmaps/base/rmaps_private.h b/orte/mca/rmaps/base/rmaps_private.h index 5531aefc28..d7ae9ae349 100644 --- a/orte/mca/rmaps/base/rmaps_private.h +++ b/orte/mca/rmaps/base/rmaps_private.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,15 +40,6 @@ BEGIN_C_DECLS * Base API functions */ -/* - * Map a job - * All calls to rmaps.map_job are routed through this function. This allows callers to - * the RMAPS framework to specify the particular mapper they wish to use. 
- */ -ORTE_DECLSPEC int orte_rmaps_base_map_job(orte_job_t *jdata); -ORTE_DECLSPEC orte_job_map_t* orte_rmaps_base_get_job_map(orte_jobid_t job); - - /* LOCAL FUNCTIONS for use by RMAPS components */ ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list, diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 4b97076b11..6fa55b1599 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -72,9 +74,9 @@ static int ppr_mapper(orte_job_t *jdata) /* only handle initial launch of loadbalanced * or NPERxxx jobs - allow restarting of failed apps */ - if (ORTE_JOB_STATE_INIT != jdata->state) { + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, - "mca:rmaps:ppr: job %s not in initial state - ppr cannot map", + "mca:rmaps:ppr: job %s being restarted - ppr cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 791e968ede..02bfef03f5 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -10,7 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. - * + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * Copyright (c) 2008 Voltaire. All rights reserved * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
* @@ -82,9 +83,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) bool initial_map=true; /* only handle initial launch of rf job */ - if (ORTE_JOB_STATE_INIT != jdata->state) { + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, - "mca:rmaps:rf: job %s not in initial state - rank_file cannot map", + "mca:rmaps:rf: job %s being restarted - rank_file cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index 7f5f4d1871..92b8ac67b7 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -3,6 +3,8 @@ * Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * * $COPYRIGHT$ * @@ -71,7 +73,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version; bool found; - if (ORTE_JOB_STATE_INIT == jdata->state) { + if (!(ORTE_JOB_CONTROL_RESTART & jdata->controls)) { if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ @@ -86,8 +88,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } - } else if (ORTE_JOB_STATE_RESTART != jdata->state && - ORTE_JOB_STATE_PROCS_MIGRATING != jdata->state) { + } else if (!(ORTE_JOB_CONTROL_PROCS_MIGRATING & jdata->controls)) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:resilient: cannot map job %s - not in restart or migrating", ORTE_JOBID_PRINT(jdata->jobid)); diff --git a/orte/mca/rmaps/rmaps.h b/orte/mca/rmaps/rmaps.h index 1215d188e5..cfcb2df28e 100644 --- 
a/orte/mca/rmaps/rmaps.h +++ b/orte/mca/rmaps/rmaps.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -57,18 +59,10 @@ BEGIN_C_DECLS * rmaps module functions */ -/** - * Public API - */ -typedef int (*orte_rmaps_base_API_map_fn_t)(orte_job_t *jdata); - -/* global structure for accessing RMAPS API's */ -typedef struct { - orte_rmaps_base_API_map_fn_t map_job; -} orte_rmaps_t; - -ORTE_DECLSPEC extern orte_rmaps_t orte_rmaps; - +/* mapping event - the event one activates to schedule mapping + * of procs to nodes for pending jobs + */ +ORTE_DECLSPEC extern opal_event_t orte_mapping_event; /** * RMAPS module functions - these are not accessible to the outside world, diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index bca474365b..f5c6f37e42 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -57,11 +59,10 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) * when rr mapping is desired - allow * restarting of failed apps */ - if (ORTE_JOB_STATE_INIT != jdata->state) { + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, - "mca:rmaps:rr: job %s in state %s - rr cannot map", - ORTE_JOBID_PRINT(jdata->jobid), - orte_job_state_to_str(jdata->state)); + "mca:rmaps:rr: job %s is being restarted - rr cannot map", + ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && diff --git a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index d11c993c31..1fb66983fb 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -82,9 +84,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) * when seq mapping is desired - allow * restarting of failed apps */ - if (ORTE_JOB_STATE_INIT != jdata->state) { + if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, - "mca:rmaps:seq: job %s not in initial state - seq cannot map", + "mca:rmaps:seq: job %s is being restarted - seq cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } diff --git a/orte/mca/rml/base/rml_base_components.c b/orte/mca/rml/base/rml_base_components.c index 8592e80aa1..1bf22ad497 100644 --- a/orte/mca/rml/base/rml_base_components.c +++ b/orte/mca/rml/base/rml_base_components.c @@ -2,6 +2,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
+ * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -63,28 +65,6 @@ static bool component_open_called = false; static bool opened = false; static bool selected = false; -/* instantiate the msg_pkt object */ -static void msg_pkt_constructor(orte_msg_packet_t *pkt) -{ - pkt->sender.jobid = ORTE_JOBID_INVALID; - pkt->sender.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_MIN); - pkt->buffer = NULL; -} -static void msg_pkt_destructor(orte_msg_packet_t *pkt) -{ - pkt->sender.jobid = ORTE_JOBID_INVALID; - pkt->sender.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_INVALID); - if (NULL != pkt->buffer) { - OBJ_RELEASE(pkt->buffer); - } -} -OBJ_CLASS_INSTANCE(orte_msg_packet_t, - opal_list_item_t, - msg_pkt_constructor, - msg_pkt_destructor); - int orte_rml_base_open(void) { @@ -276,3 +256,11 @@ orte_rml_base_close(void) return ORTE_SUCCESS; } + +void orte_rml_send_callback(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) + +{ + OBJ_RELEASE(buffer); +} diff --git a/orte/mca/rml/base/rml_base_contact.c b/orte/mca/rml/base/rml_base_contact.c index a8460aa332..7ea550eedc 100644 --- a/orte/mca/rml/base/rml_base_contact.c +++ b/orte/mca/rml/base/rml_base_contact.c @@ -142,12 +142,10 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) orte_process_info.max_procs = orte_process_info.num_procs; } - /* if we changed it, then we better update the routed - * tree so daemon collectives work correctly + /* if we changed it, then we better update the routing + * plan so daemon collectives work correctly */ - if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) { - ORTE_ERROR_LOG(rc); - } + orte_routed.update_routing_plan(); } return ORTE_SUCCESS; diff --git a/orte/mca/rml/base/rml_base_receive.c b/orte/mca/rml/base/rml_base_receive.c index 8ae61a98a0..e888c2d668 
100644 --- a/orte/mca/rml/base/rml_base_receive.c +++ b/orte/mca/rml/base/rml_base_receive.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -60,7 +60,7 @@ int orte_rml_base_comm_start(void) if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RML_INFO_UPDATE, - ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_rml_base_recv, NULL))) { ORTE_ERROR_LOG(rc); @@ -87,29 +87,35 @@ int orte_rml_base_comm_stop(void) return rc; } -static void process_message(int fd, short event, void *data) +/* handle message from proxies + * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program. + * DO NOT RELEASE THIS BUFFER IN THIS CODE + */ +static void +orte_rml_base_recv(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)data; orte_rml_cmd_flag_t command; orte_std_cntr_t count; - opal_buffer_t buf; + opal_buffer_t *buf; int rc; OPAL_OUTPUT_VERBOSE((5, orte_rml_base_output, "%s rml:base:recv: processing message from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender))); + ORTE_NAME_PRINT(sender))); count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &command, &count, ORTE_RML_CMD))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_RML_CMD))) { ORTE_ERROR_LOG(rc); return; } switch (command) { case ORTE_RML_UPDATE_CMD: - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(mev->buffer))) { + if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(buffer))) { ORTE_ERROR_LOG(rc); return; } @@ -129,47 +135,13 @@ static void process_message(int fd, short event, void *data) 
OPAL_OUTPUT_VERBOSE((5, orte_rml_base_output, "%s rml:base:recv: sending ack to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&mev->sender))); + ORTE_NAME_PRINT(sender))); - OBJ_CONSTRUCT(&buf, opal_buffer_t); - if (0 > (rc = orte_rml.send_buffer(&mev->sender, &buf, ORTE_RML_TAG_UPDATE_ROUTE_ACK, 0))) { + buf = OBJ_NEW(opal_buffer_t); + if (0 > (rc = orte_rml.send_buffer_nb(sender, buf, ORTE_RML_TAG_UPDATE_ROUTE_ACK, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); } - OBJ_DESTRUCT(&buf); - - OBJ_RELEASE(mev); + OBJ_RELEASE(buf); } - -/* - * handle message from proxies - * NOTE: The incoming buffer "buffer" is OBJ_RELEASED by the calling program. - * DO NOT RELEASE THIS BUFFER IN THIS CODE - */ - -static void -orte_rml_base_recv(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_message); - - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_RML_INFO_UPDATE, - ORTE_RML_NON_PERSISTENT, - orte_rml_base_recv, - NULL))) { - ORTE_ERROR_LOG(rc); - } -} - diff --git a/orte/mca/rml/oob/rml_oob_component.c b/orte/mca/rml/oob/rml_oob_component.c index 986b27e174..1aa69a5b3b 100644 --- a/orte/mca/rml/oob/rml_oob_component.c +++ b/orte/mca/rml/oob/rml_oob_component.c @@ -167,7 +167,7 @@ rml_oob_init(int* priority) if (NULL == orte_rml_oob_module.timer_event) { return NULL; } - opal_event_evtimer_set(opal_event_base, orte_rml_oob_module.timer_event, + opal_event_evtimer_set(orte_event_base, orte_rml_oob_module.timer_event, rml_oob_queued_progress, NULL); diff --git a/orte/mca/rml/rml.h b/orte/mca/rml/rml.h index c2bd98dfe6..473feb2723 100644 --- a/orte/mca/rml/rml.h +++ b/orte/mca/rml/rml.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -54,6 +56,14 @@ struct orte_process_name_t; struct orte_rml_module_t; +/* Provide a generic callback function to release buffers + * following a non-blocking send as this happens all over + * the code base + */ +ORTE_DECLSPEC void orte_rml_send_callback(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); + /* ******************************************************************** */ diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index aa71907f54..2df4003c43 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -44,65 +44,6 @@ BEGIN_C_DECLS - -/* ******************************************************************** */ - -typedef struct { - opal_list_item_t super; - orte_process_name_t sender; - opal_buffer_t *buffer; -} orte_msg_packet_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t); - -#ifndef __WINDOWS__ -#define ORTE_PROCESS_MESSAGE(rlist, lck, flg, fd, crt, sndr, buf) \ - do { \ - orte_msg_packet_t *pkt; \ - int data=1; \ - pkt = OBJ_NEW(orte_msg_packet_t); \ - pkt->sender.jobid = (sndr)->jobid; \ - pkt->sender.vpid = (sndr)->vpid; \ - ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ - if ((crt)) { \ - pkt->buffer = OBJ_NEW(opal_buffer_t); \ - opal_dss.copy_payload(pkt->buffer, *(buf)); \ - } else { \ - pkt->buffer = *(buf); \ - *(buf) = NULL; \ - } \ - OPAL_THREAD_LOCK((lck)); \ - opal_list_append((rlist), &pkt->super); \ - if (!(flg)) { \ - write((fd), &data, sizeof(data)); \ - } \ - OPAL_THREAD_UNLOCK((lck)); \ - } while(0); -#else -#define ORTE_PROCESS_MESSAGE(rlist, lck, flg, fd, crt, sndr, buf) \ - do { \ - orte_msg_packet_t *pkt; \ - int data=1; \ - pkt = OBJ_NEW(orte_msg_packet_t); \ - pkt->sender.jobid = (sndr)->jobid; \ - pkt->sender.vpid = (sndr)->vpid; \ - ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \ - if ((crt)) { \ - pkt->buffer = OBJ_NEW(opal_buffer_t); \ - opal_dss.copy_payload(pkt->buffer, *(buf)); \ - } else { \ - pkt->buffer = *(buf); \ - *(buf) = NULL; \ - } \ - OPAL_THREAD_LOCK((lck)); \ - opal_list_append((rlist), &pkt->super); \ - if (!(flg)) { \ - send((fd), (const char*) &data, sizeof(data), 0); \ - } \ - OPAL_THREAD_UNLOCK((lck)); \ - } while(0); -#endif - - /** * Constant tag values for well-known services */ @@ -125,10 +66,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t); #define ORTE_RML_TAG_CKPT 13 #define ORTE_RML_TAG_RML_ROUTE 14 - -#define ORTE_RML_TAG_ALLGATHER 15 -#define ORTE_RML_TAG_ALLGATHER_LIST 16 -#define ORTE_RML_TAG_BARRIER 17 +#define ORTE_RML_TAG_XCAST 15 #define 
ORTE_RML_TAG_UPDATE_ROUTE_ACK 19 #define ORTE_RML_TAG_SYNC 20 @@ -154,8 +92,11 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t); /* timing related */ #define ORTE_RML_TAG_COLLECTIVE_TIMER 29 -/* daemon collectives */ -#define ORTE_RML_TAG_DAEMON_COLLECTIVE 30 +/* collectives */ +#define ORTE_RML_TAG_COLLECTIVE 30 +#define ORTE_RML_TAG_COLL_ID 50 +#define ORTE_RML_TAG_DAEMON_COLL 52 +#define ORTE_RML_TAG_COLL_ID_REQ 53 /* show help */ #define ORTE_RML_TAG_SHOW_HELP 31 @@ -191,10 +132,6 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t); #define ORTE_RML_TAG_SUBSCRIBE 46 -#if ORTE_ENABLE_EPOCH -/* For Epoch Updates */ -#define ORTE_RML_TAG_EPOCH_CHANGE 47 -#endif /* Notify of failed processes */ #define ORTE_RML_TAG_FAILURE_NOTICE 48 diff --git a/orte/mca/routed/base/Makefile.am b/orte/mca/routed/base/Makefile.am index b094611b5b..f7200a78c2 100644 --- a/orte/mca/routed/base/Makefile.am +++ b/orte/mca/routed/base/Makefile.am @@ -17,5 +17,5 @@ libmca_routed_la_SOURCES += \ if !ORTE_DISABLE_FULL_SUPPORT libmca_routed_la_SOURCES += \ - base/routed_base_register_sync.c + base/routed_base_fns.c endif diff --git a/orte/mca/routed/base/base.h b/orte/mca/routed/base/base.h index b0f559b9af..457000cc3e 100644 --- a/orte/mca/routed/base/base.h +++ b/orte/mca/routed/base/base.h @@ -42,6 +42,13 @@ ORTE_DECLSPEC extern opal_condition_t orte_routed_base_cond; ORTE_DECLSPEC extern bool orte_routed_base_wait_sync; ORTE_DECLSPEC extern opal_pointer_array_t orte_routed_jobfams; +ORTE_DECLSPEC void orte_routed_base_xcast_routing(orte_grpcomm_collective_t *coll, + opal_list_t *my_children); +ORTE_DECLSPEC void orte_routed_base_coll_relay_routing(orte_grpcomm_collective_t *coll); +ORTE_DECLSPEC void orte_routed_base_coll_complete_routing(orte_grpcomm_collective_t *coll); +ORTE_DECLSPEC void orte_routed_base_coll_peers(orte_grpcomm_collective_t *coll, + opal_list_t *my_children); + ORTE_DECLSPEC int orte_routed_base_register_sync(bool setup); ORTE_DECLSPEC int 
orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer); diff --git a/orte/mca/routed/base/routed_base_components.c b/orte/mca/routed/base/routed_base_components.c index bb524c7c68..c8cebfe639 100644 --- a/orte/mca/routed/base/routed_base_components.c +++ b/orte/mca/routed/base/routed_base_components.c @@ -65,7 +65,6 @@ static void jfamconst(orte_routed_jobfam_t *ptr) { ptr->route.jobid = ORTE_JOBID_INVALID; ptr->route.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(ptr->route.epoch,ORTE_EPOCH_MIN); ptr->hnp_uri = NULL; } static void jfamdest(orte_routed_jobfam_t *ptr) @@ -117,7 +116,6 @@ orte_routed_base_open(void) jfam = OBJ_NEW(orte_routed_jobfam_t); jfam->route.jobid = ORTE_PROC_MY_HNP->jobid; jfam->route.vpid = ORTE_PROC_MY_HNP->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch); jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); if (NULL != orte_process_info.my_hnp_uri) { jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri); @@ -252,7 +250,6 @@ void orte_routed_base_update_hnps(opal_buffer_t *buf) jfam->job_family = jobfamily; jfam->route.jobid = name.jobid; jfam->route.vpid = name.vpid; - ORTE_EPOCH_SET(jfam->route.epoch,name.epoch); jfam->hnp_uri = strdup(uri); done: free(uri); diff --git a/orte/mca/routed/base/routed_base_fns.c b/orte/mca/routed/base/routed_base_fns.c new file mode 100644 index 0000000000..bbc0c09fc5 --- /dev/null +++ b/orte/mca/routed/base/routed_base_fns.c @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/types.h" + +#include "opal/dss/dss.h" +#include "opal/runtime/opal_progress.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/odls/odls_types.h" +#include "orte/mca/rml/rml.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/routed/base/base.h" + +void orte_routed_base_xcast_routing(orte_grpcomm_collective_t *coll, + opal_list_t *my_children) +{ + opal_list_item_t *item; + orte_routed_tree_t *child; + orte_namelist_t *nm; + int i; + orte_proc_t *proc; + orte_job_t *daemons; + + /* if we are the HNP and an abnormal termination is underway, + * then send it directly to everyone + */ + if (ORTE_PROC_IS_HNP) { + if (orte_abnormal_term_ordered) { + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + for (i=1; i < daemons->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, i))) { + continue; + } + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = proc->name.vpid; + opal_list_append(&coll->targets, &nm->super); + } + } else { + /* the binomial xcast always goes to our children */ + for (item = opal_list_get_first(my_children); + item != opal_list_get_end(my_children); + item = opal_list_get_next(item)) { + child = (orte_routed_tree_t*)item; + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = child->vpid; + opal_list_append(&coll->targets, &nm->super); + } + } + } else { + /* I am a daemon - route to my children */ + for (item = opal_list_get_first(my_children); + item != opal_list_get_end(my_children); + item = opal_list_get_next(item)) { + child = 
(orte_routed_tree_t*)item; + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = child->vpid; + opal_list_append(&coll->targets, &nm->super); + } + } +} + +void orte_routed_base_coll_relay_routing(orte_grpcomm_collective_t *coll) +{ + opal_list_item_t *item, *itm; + orte_namelist_t *nm, *n2, *n3; + bool dup; + + if (ORTE_PROC_IS_HNP) { + /* nobody to send to */ + return; + } + /* if we are a daemon, then we look at the list of + * participants. If there is a wildcard, then we + * know that all procs are participating, so we + * can send it to our parent. If not, then we have + * to send the collective to the daemon hosting + * the participating proc + */ + for (item = opal_list_get_first(&coll->participants); + item != opal_list_get_end(&coll->participants); + item = opal_list_get_next(item)) { + n2 = (orte_namelist_t*)item; + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + dup = false; + if (ORTE_VPID_WILDCARD == n2->name.vpid) { + nm->name.vpid = ORTE_PROC_MY_PARENT->vpid; + } else { + nm->name.vpid = orte_ess.proc_get_daemon(&n2->name); + } + /* if it is me, then ignore */ + if (nm->name.vpid == ORTE_PROC_MY_NAME->vpid) { + dup = true; + } else { + /* if it is already on the list, we ignore */ + for (itm = opal_list_get_first(&coll->targets); + itm != opal_list_get_end(&coll->targets); + itm = opal_list_get_next(itm)) { + n3 = (orte_namelist_t*)itm; + if (n3->name.vpid == nm->name.vpid) { + /* duplicate */ + dup = true; + break; + } + } + } + if (dup) { + OBJ_RELEASE(nm); + } else { + opal_list_append(&coll->targets, &nm->super); + } + } +} + +void orte_routed_base_coll_complete_routing(orte_grpcomm_collective_t *coll) +{ + opal_list_item_t *item; + orte_namelist_t *nm, *n2; + int i; + orte_proc_t *proc; + + if (ORTE_PROC_IS_HNP) { + /* send it to everyone that participated */ + for (item = opal_list_get_first(&coll->participants); + item != opal_list_get_end(&coll->participants); + item = 
opal_list_get_next(item)) { + n2 = (orte_namelist_t*)item; + /* if the vpid is wildcard, then the result will go + * to everyone in the job via xcast, so just carry it + * across + */ + if (ORTE_VPID_WILDCARD == n2->name.vpid) { + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = n2->name.jobid; + nm->name.vpid = n2->name.vpid; + opal_list_append(&coll->targets, &nm->super); + } else { + /* only include it if the proc is local to us */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (proc->name.jobid == n2->name.jobid && + proc->name.vpid == n2->name.vpid) { + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = n2->name.jobid; + nm->name.vpid = n2->name.vpid; + opal_list_append(&coll->targets, &nm->super); + break; + } + } + } + } + } else { + /* if the participants are wildcard, then the HNP will + * be sending the result to the procs via xcast. For all + * other cases, handle our own local children + */ + for (item = opal_list_get_first(&coll->participants); + item != opal_list_get_end(&coll->participants); + item = opal_list_get_next(item)) { + n2 = (orte_namelist_t*)item; + if (ORTE_VPID_WILDCARD == n2->name.vpid) { + continue; + } + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (proc->name.jobid == n2->name.jobid && + proc->name.vpid == n2->name.vpid) { + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = n2->name.jobid; + nm->name.vpid = n2->name.vpid; + opal_list_append(&coll->targets, &nm->super); + break; + } + } + } + } +} + +void orte_routed_base_coll_peers(orte_grpcomm_collective_t *coll, + opal_list_t *my_children) +{ + opal_list_item_t *item; + orte_routed_tree_t *child; + orte_namelist_t *nm; + + /* tree-based systems require input from their children */ + for (item = opal_list_get_first(my_children); + item != 
opal_list_get_end(my_children); + item = opal_list_get_next(item)) { + child = (orte_routed_tree_t*)item; + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = child->vpid; + opal_list_append(&coll->targets, &nm->super); + } + } + + +static bool sync_recvd; + +static void report_sync(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata) +{ + /* just copy the payload to the sync_buf */ + opal_dss.copy_payload(orte_process_info.sync_buf, buffer); + /* flag as complete */ + sync_recvd = true; +} + +int orte_routed_base_register_sync(bool setup) +{ + opal_buffer_t *buffer; + int rc; + orte_daemon_cmd_flag_t command=ORTE_DAEMON_SYNC_BY_PROC; + char *rml_uri; + + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s registering sync to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON))); + + /* we need to get the oob to establish + * the connection - the oob will leave the connection "alive" + * thereafter so we can communicate readily + */ + + buffer = OBJ_NEW(opal_buffer_t); + + /* if we are setting up, tell the daemon to send back a nidmap */ + if (setup) { + command = ORTE_DAEMON_SYNC_WANT_NIDMAP; + } + + + /* tell the daemon to sync */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + return rc; + } + + /* add our contact info to the buffer so the daemon can explicitly + * store it + */ + rml_uri = orte_rml.get_contact_info(); + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + free(rml_uri); + return rc; + } + if (NULL != rml_uri) free(rml_uri); + + /* send the sync command to our daemon */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer, + ORTE_RML_TAG_DAEMON, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* get the ack - need 
this to ensure that the sync communication + * gets serviced by the event library on the orted prior to the + * process exiting + */ + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s registering sync waiting for ack", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + sync_recvd = false; + rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC, + ORTE_RML_NON_PERSISTENT, report_sync, NULL); + if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* it is okay to block here as we are -not- in an event */ + while (!sync_recvd) { + opal_progress(); + } + + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s registering sync ack recvd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + return ORTE_SUCCESS; +} + +int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer) +{ + orte_proc_t *proc; + orte_job_t *jdata; + orte_std_cntr_t cnt; + char *rml_uri; + orte_vpid_t vpid; + int rc; + + if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { + /* came from singleton - don't process it */ + return ORTE_SUCCESS; + } + + /* lookup the job object for this process */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + /* came from my job family - this is an error */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + /* unpack the data for each entry */ + cnt = 1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) { + + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + continue; + } + + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routed_binomial:callback got uri %s for job %s rank %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == rml_uri) ? 
"NULL" : rml_uri, + ORTE_JOBID_PRINT(job), ORTE_VPID_PRINT(vpid))); + + if (NULL == rml_uri) { + /* should not happen */ + ORTE_ERROR_LOG(ORTE_ERR_FATAL); + return ORTE_ERR_FATAL; + } + + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + + /* update the record */ + proc->rml_uri = strdup(rml_uri); + free(rml_uri); + + cnt = 1; + } + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/routed/base/routed_base_register_sync.c b/orte/mca/routed/base/routed_base_register_sync.c deleted file mode 100644 index 6ca973d398..0000000000 --- a/orte/mca/routed/base/routed_base_register_sync.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include "opal/dss/dss.h" -#include "opal/threads/threads.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/odls/odls_types.h" -#include "orte/mca/rml/rml.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/routed/base/base.h" - -static bool sync_recvd; - -static void report_sync(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - /* just copy the payload to the sync_buf */ - opal_dss.copy_payload(orte_process_info.sync_buf, buffer); - /* flag as complete */ - sync_recvd = true; -} - -int orte_routed_base_register_sync(bool setup) -{ - opal_buffer_t buffer; - int rc; - orte_daemon_cmd_flag_t command=ORTE_DAEMON_SYNC_BY_PROC; - char *rml_uri; - - OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s registering sync to daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON))); - - /* we need to get the oob to establish - * the connection - the oob will leave the connection "alive" - * thereafter so we can communicate readily - */ - - OBJ_CONSTRUCT(&buffer, opal_buffer_t); - - /* if we are setting up, tell the daemon to send back a nidmap */ - if (setup) { - command = ORTE_DAEMON_SYNC_WANT_NIDMAP; - } - - - /* tell the daemon to sync */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buffer); - return rc; - } - - /* add our contact info to the buffer so the daemon can explicitly - * store it - */ - rml_uri = orte_rml.get_contact_info(); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &rml_uri, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buffer); - free(rml_uri); - return rc; - } - if (NULL != rml_uri) free(rml_uri); - - /* send the sync command to our daemon */ - if (0 > (rc = 
orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_DAEMON, 0))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buffer); - return rc; - } - OBJ_DESTRUCT(&buffer); - - /* get the ack - need this to ensure that the sync communication - * gets serviced by the event library on the orted prior to the - * process exiting - */ - OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s registering sync waiting for ack", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - sync_recvd = false; - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC, - ORTE_RML_NON_PERSISTENT, report_sync, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - return rc; - } - - ORTE_PROGRESSED_WAIT(sync_recvd, 0, 1); - - OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s registering sync ack recvd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - return ORTE_SUCCESS; -} - -int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer) -{ - orte_proc_t *proc; - orte_job_t *jdata; - orte_std_cntr_t cnt; - char *rml_uri; - orte_vpid_t vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; -#endif - int rc; - - if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { - /* came from singleton - don't process it */ - return ORTE_SUCCESS; - } - - /* lookup the job object for this process */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - /* came from my job family - this is an error */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - - /* unpack the data for each entry */ - cnt = 1; - while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) { - -#if ORTE_ENABLE_EPOCH - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - continue; - } -#endif - - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - continue; - } - - OPAL_OUTPUT_VERBOSE((2, 
orte_routed_base_output, - "%s routed_binomial:callback got uri %s for job %s rank %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == rml_uri) ? "NULL" : rml_uri, - ORTE_JOBID_PRINT(job), ORTE_VPID_PRINT(vpid))); - - if (NULL == rml_uri) { - /* should not happen */ - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - continue; - } - - /* update the record */ - proc->rml_uri = strdup(rml_uri); - free(rml_uri); - - /* update the proc state */ - orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF, - &proc->name, ORTE_PROC_STATE_RUNNING, 0, 0); - cnt = 1; - } - if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index 557b21bd2b..3947361ea7 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -4,6 +4,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -16,10 +18,10 @@ #include -#include "opal/threads/condition.h" #include "opal/dss/dss.h" #include "opal/class/opal_pointer_array.h" #include "opal/class/opal_bitmap.h" +#include "opal/runtime/opal_progress.h" #include "opal/util/bit_ops.h" #include "opal/util/output.h" @@ -48,8 +50,9 @@ static orte_process_name_t get_route(orte_process_name_t *target); static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(orte_jobid_t jobid); -static orte_vpid_t get_routing_tree(opal_list_t *children); +static void update_routing_plan(void); +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); static size_t num_routes(void); @@ -68,8 +71,8 @@ orte_routed_module_t orte_routed_binomial_module = { route_lost, route_is_defined, set_lifeline, - update_routing_tree, - get_routing_tree, + update_routing_plan, + get_routing_list, get_wireup_info, num_routes, #if OPAL_ENABLE_FT_CR == 1 @@ -80,8 +83,6 @@ orte_routed_module_t orte_routed_binomial_module = { }; /* local globals */ -static opal_condition_t cond; -static opal_mutex_t lock; static orte_process_name_t *lifeline=NULL; static orte_process_name_t local_lifeline; static int num_children; @@ -91,11 +92,6 @@ static bool hnp_direct=true; static int init(void) { - - /* setup the global condition and lock */ - OBJ_CONSTRUCT(&cond, opal_condition_t); - OBJ_CONSTRUCT(&lock, opal_mutex_t); - lifeline = NULL; /* setup the list of children */ @@ -123,10 +119,6 @@ static int finalize(void) } } - /* destruct the global condition and lock */ - OBJ_DESTRUCT(&cond); - OBJ_DESTRUCT(&lock); - lifeline = NULL; /* deconstruct the list of children */ @@ -145,14 +137,8 @@ static int 
delete_route(orte_process_name_t *proc) orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (proc->jobid == ORTE_JOBID_INVALID || - proc->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { -#else if (proc->jobid == ORTE_JOBID_INVALID || proc->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -219,14 +205,8 @@ static int update_route(orte_process_name_t *target, orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -287,7 +267,6 @@ static int update_route(orte_process_name_t *target, ORTE_NAME_PRINT(route))); jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); return ORTE_SUCCESS; } @@ -302,7 +281,6 @@ static int update_route(orte_process_name_t *target, jfam->job_family = jfamily; jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); opal_pointer_array_add(&orte_routed_jobfams, jfam); return ORTE_SUCCESS; @@ -333,21 +311,9 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* initialize */ daemon.jobid = ORTE_PROC_MY_DAEMON->jobid; daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; - ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch); -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif - ret = ORTE_NAME_INVALID; - goto found; - } - - if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { ret = 
ORTE_NAME_INVALID; goto found; } @@ -443,7 +409,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) goto found; } - startover: /* search routing tree for next step to that daemon */ for (item = opal_list_get_first(&my_children); item != opal_list_get_end(&my_children); @@ -459,13 +424,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* yep - we need to step through this child */ daemon.vpid = child->vpid; - /* If the daemon to which we should be routing is dead, then update - * the routing tree and start over. */ - if (!PROC_IS_RUNNING(&daemon)) { - update_routing_tree(daemon.jobid); - goto startover; - } - ret = &daemon; goto found; } @@ -479,8 +437,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) ret = &daemon; found: - ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_binomial_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -490,28 +446,11 @@ static orte_process_name_t get_route(orte_process_name_t *target) return *ret; } -/* HANDLE ACK MESSAGES FROM AN HNP */ -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - static void recv_ack(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); + ack_recvd = true; } @@ -682,7 +621,9 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); + while (!ack_recvd) { + opal_progress(); + } OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_binomial_init_routes: ack recvd", @@ -814,9 +755,10 @@ static int route_lost(const orte_process_name_t *route) if (!orte_finalizing && NULL != lifeline && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) { - opal_output(0, "%s routed:binomial: Connection to lifeline %s lost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(lifeline)); + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routed:binomial: Connection to lifeline %s lost", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(lifeline))); return ORTE_ERR_FATAL; } @@ -891,7 +833,6 @@ static int set_lifeline(orte_process_name_t *proc) */ local_lifeline.jobid = proc->jobid; local_lifeline.vpid = proc->vpid; - ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); lifeline = &local_lifeline; return ORTE_SUCCESS; @@ -899,15 +840,13 @@ static int set_lifeline(orte_process_name_t *proc) static int binomial_tree(int rank, int parent, int me, int num_procs, int *nchildren, opal_list_t *childrn, - opal_bitmap_t *relatives, bool mine, orte_jobid_t jobid) + opal_bitmap_t *relatives, bool mine) { int i, bitmap, peer, hibit, mask, found; orte_routed_tree_t *child; opal_bitmap_t *relations; orte_process_name_t proc_name; - proc_name.jobid = jobid; - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, "%s routed:binomial rank %d parent %d me %d 
num_procs %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -936,27 +875,8 @@ static int binomial_tree(int rank, int parent, int me, int num_procs, * that process so we can check it's state. */ proc_name.vpid = peer; - ORTE_EPOCH_SET(proc_name.epoch,orte_util_lookup_epoch(&proc_name)); - if (!PROC_IS_RUNNING(&proc_name) - && 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,proc_name.epoch) - && 0 != ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc_name.epoch)) { - OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output, - "%s routed:binomial child %s is dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_VPID_PRINT(child->vpid))); - relations = relatives; - - /* Leave mine as it is. If it was true, then we want to - * inherit the dead node's children as our own. If it wasn't - * then we want it's relatives as our own. */ - binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, mine, jobid); - - /* If we use the proc_is_running as a way of measuring of the - * process is dead, then we get screwed up on startup. By also - * testing the epoch, we make sure that the process really did - * start up and then died. 
*/ - } else if (mine) { + if (mine) { /* this is a direct child - add it to my list */ opal_list_append(childrn, &child->super); (*nchildren)++; @@ -972,7 +892,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs, relations = relatives; } /* search for this child's relatives */ - binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false, jobid); + binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false); } } return parent; @@ -997,16 +917,9 @@ static int binomial_tree(int rank, int parent, int me, int num_procs, "%s routed:binomial find children computing tree", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* execute compute on this child */ - if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine, jobid))) { + if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine))) { proc_name.vpid = found; - if (!PROC_IS_RUNNING(&proc_name) - && 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,orte_util_lookup_epoch(&proc_name))) { - OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s routed:binomial find children proc out of date - returning parent %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), parent)); - return parent; - } OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, "%s routed:binomial find children returning found value %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found)); @@ -1017,7 +930,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs, return -1; } -static int update_routing_tree(orte_jobid_t jobid) +static void update_routing_plan(void) { orte_routed_tree_t *child; int j; @@ -1027,7 +940,7 @@ static int update_routing_tree(orte_jobid_t jobid) * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_ERR_NOT_SUPPORTED; + return; } /* clear the list of children if any are already present */ @@ -1041,8 +954,7 @@ static int update_routing_tree(orte_jobid_t jobid) */ 
ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid, orte_process_info.max_procs, - &num_children, &my_children, NULL, true, jobid); - ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); + &num_children, &my_children, NULL, true); if (0 < opal_output_get_verbosity(orte_routed_base_output)) { opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children); @@ -1058,40 +970,28 @@ static int update_routing_tree(orte_jobid_t jobid) } } } - - return ORTE_SUCCESS; } -static orte_vpid_t get_routing_tree(opal_list_t *children) +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll) { - opal_list_item_t *item; - orte_routed_tree_t *child; - orte_routed_tree_t *nm; - + /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_VPID_INVALID; + return; } - /* the binomial routing tree always goes to our children, - * for any job - */ - if (NULL != children) { - for (item = opal_list_get_first(&my_children); - item != opal_list_get_end(&my_children); - item = opal_list_get_next(item)) { - child = (orte_routed_tree_t*)item; - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = child->vpid; - opal_bitmap_copy(&nm->relatives, &child->relatives); - opal_list_append(children, &nm->super); - } + if (ORTE_GRPCOMM_XCAST == type) { + orte_routed_base_xcast_routing(coll, &my_children); + } else if (ORTE_GRPCOMM_COLL_RELAY == type) { + orte_routed_base_coll_relay_routing(coll); + } else if (ORTE_GRPCOMM_COLL_COMPLETE == type) { + orte_routed_base_coll_complete_routing(coll); + } else if (ORTE_GRPCOMM_COLL_PEERS == type) { + orte_routed_base_coll_peers(coll, &my_children); } - - /* return my parent's vpid */ - return ORTE_PROC_MY_PARENT->vpid; } static int get_wireup_info(opal_buffer_t *buf) diff --git 
a/orte/mca/routed/cm/routed_cm.c b/orte/mca/routed/cm/routed_cm.c index 3e93d6d83a..bd01ecddc5 100644 --- a/orte/mca/routed/cm/routed_cm.c +++ b/orte/mca/routed/cm/routed_cm.c @@ -19,10 +19,10 @@ #include -#include "opal/threads/condition.h" #include "opal/dss/dss.h" #include "opal/class/opal_hash_table.h" #include "opal/class/opal_bitmap.h" +#include "opal/runtime/opal_progress.h" #include "opal/util/bit_ops.h" #include "opal/util/output.h" @@ -50,8 +50,9 @@ static orte_process_name_t get_route(orte_process_name_t *target); static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(orte_jobid_t jobid); -static orte_vpid_t get_routing_tree(opal_list_t *children); +static void update_routing_plan(void); +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); static size_t num_routes(void); @@ -70,8 +71,8 @@ orte_routed_module_t orte_routed_cm_module = { route_lost, route_is_defined, set_lifeline, - update_routing_tree, - get_routing_tree, + update_routing_plan, + get_routing_list, get_wireup_info, num_routes, #if OPAL_ENABLE_FT_CR == 1 @@ -82,8 +83,6 @@ orte_routed_module_t orte_routed_cm_module = { }; /* local globals */ -static opal_condition_t cond; -static opal_mutex_t lock; static orte_process_name_t *lifeline=NULL; static orte_process_name_t local_lifeline; static bool ack_recvd; @@ -91,10 +90,6 @@ static bool ack_recvd; static int init(void) { - /* setup the global condition and lock */ - OBJ_CONSTRUCT(&cond, opal_condition_t); - OBJ_CONSTRUCT(&lock, opal_mutex_t); - lifeline = NULL; return ORTE_SUCCESS; @@ -122,10 +117,6 @@ static int finalize(void) } cleanup: - /* destruct the global condition and lock */ - OBJ_DESTRUCT(&cond); - OBJ_DESTRUCT(&lock); - lifeline = NULL; 
return ORTE_SUCCESS; @@ -137,14 +128,8 @@ static int delete_route(orte_process_name_t *proc) orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (proc->jobid == ORTE_JOBID_INVALID || - proc->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (proc->jobid == ORTE_JOBID_INVALID || proc->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -203,14 +188,8 @@ static int update_route(orte_process_name_t *target, orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -267,7 +246,6 @@ static int update_route(orte_process_name_t *target, ORTE_NAME_PRINT(route))); jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); return ORTE_SUCCESS; } @@ -282,7 +260,6 @@ static int update_route(orte_process_name_t *target, jfam->job_family = jfamily; jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); opal_pointer_array_add(&orte_routed_jobfams, jfam); return ORTE_SUCCESS; @@ -307,19 +284,8 @@ static orte_process_name_t get_route(orte_process_name_t *target) goto found; } -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif - ret = ORTE_NAME_INVALID; - goto found; - } - - if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { ret = ORTE_NAME_INVALID; goto found; } @@ -386,9 +352,6 @@ static orte_process_name_t 
get_route(orte_process_name_t *target) goto found; } - /* Initialize daemon's epoch, based on its current vpid/jobid */ - ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); - /* if the daemon is me, then send direct to the target! */ if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { ret = target; @@ -444,28 +407,11 @@ static orte_process_name_t get_route(orte_process_name_t *target) return *ret; } -/* HANDLE ACK MESSAGES FROM AN HNP */ -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - static void recv_ack(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); + ack_recvd = true; } @@ -581,6 +527,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) /* set our lifeline to the HNP - we will abort if that connection is lost */ lifeline = ORTE_PROC_MY_HNP; + /* define our parent to be the HNP */ + ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_HNP->jobid; + ORTE_PROC_MY_PARENT->vpid = ORTE_PROC_MY_HNP->vpid; + /* daemons will send their contact info back to the HNP as * part of the message confirming they are read to go. 
HNP's * load their contact info during orte_init @@ -681,8 +631,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); - + while (!ack_recvd) { + opal_progress(); + } + OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_cm_init_routes: ack recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -796,9 +748,10 @@ static int route_lost(const orte_process_name_t *route) if (!orte_finalizing && NULL != lifeline && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) { - opal_output(0, "%s routed:cm: Connection to lifeline %s lost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(lifeline)); + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routed:cm: Connection to lifeline %s lost", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(lifeline))); return ORTE_ERR_FATAL; } @@ -824,77 +777,81 @@ static int set_lifeline(orte_process_name_t *proc) */ local_lifeline.jobid = proc->jobid; local_lifeline.vpid = proc->vpid; - ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline)); lifeline = &local_lifeline; return ORTE_SUCCESS; } -static int update_routing_tree(orte_jobid_t jobid) +static void update_routing_plan(void) { /* nothing to do here */ - return ORTE_SUCCESS; + return; } -static orte_vpid_t get_routing_tree(opal_list_t *children) +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll) { - orte_routed_tree_t *nm; + orte_namelist_t *nm; int32_t i; orte_job_t *jdata; orte_proc_t *proc; - /* if I am anything other than a daemon or the HNP, this + /* if I am anything other than daemons and the HNP, this * is a meaningless command as I am not allowed to route */ - if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_VPID_INVALID; + if (!ORTE_PROC_IS_DAEMON || !ORTE_PROC_IS_HNP) { + return; 
} - /* if I am a daemon, I do not have any children */ - if (ORTE_PROC_IS_DAEMON) { - return ORTE_PROC_MY_HNP->vpid; - } - - /* for the HNP, the cm routing tree is direct to all known alive daemons */ - if (NULL != children) { + if (ORTE_GRPCOMM_XCAST == type) { + /* daemons don't route */ + if (ORTE_PROC_IS_DAEMON) { + return; + } + /* HNP sends direct to each daemon */ if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + return; } - - for(i = 0; i < jdata->procs->size; ++i) { + for (i=1; i < jdata->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { continue; } - if( proc->name.vpid == 0) { - continue; - } - if( proc->state <= ORTE_PROC_STATE_UNTERMINATED && NULL != proc->rml_uri ) { OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s get_routing_tree: Adding process %15s state 0x%x", + "%s get_routing_tree: Adding process %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(proc->name)), - proc->state)); + orte_proc_state_to_str(proc->state))); - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = proc->name.vpid; - opal_list_append(children, &nm->super); - } - else { + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc->name.jobid; + nm->name.vpid = proc->name.vpid; + opal_list_append(&coll->targets, &nm->super); + } else { OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, - "%s get_routing_tree: Skipped process %15s state 0x%x (non functional daemon)", + "%s get_routing_tree: Skipped process %15s state %s (non functional daemon)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(proc->name)), - proc->state)); + orte_proc_state_to_str(proc->state))); } } + } else if (ORTE_GRPCOMM_COLL_RELAY == type) { + orte_routed_base_coll_relay_routing(coll); + } else if (ORTE_GRPCOMM_COLL_COMPLETE == type) { + orte_routed_base_coll_complete_routing(coll); + } else if (ORTE_GRPCOMM_COLL_PEERS == type) { + if 
(ORTE_PROC_IS_DAEMON) { + return; + } + /* HNP receives from all */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->targets, &nm->super); } - - /* I have no parent */ - return ORTE_VPID_INVALID; } static int get_wireup_info(opal_buffer_t *buf) diff --git a/orte/mca/routed/debruijn/routed_debruijn.c b/orte/mca/routed/debruijn/routed_debruijn.c index dce3265c80..58d01ccf23 100644 --- a/orte/mca/routed/debruijn/routed_debruijn.c +++ b/orte/mca/routed/debruijn/routed_debruijn.c @@ -47,8 +47,9 @@ static orte_process_name_t get_route(orte_process_name_t *target); static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(orte_jobid_t jobid); -static orte_vpid_t get_routing_tree(opal_list_t *children); +static void update_routing_plan(void); +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); static size_t num_routes(void); @@ -67,8 +68,8 @@ orte_routed_module_t orte_routed_debruijn_module = { route_lost, route_is_defined, set_lifeline, - update_routing_tree, - get_routing_tree, + update_routing_plan, + get_routing_list, get_wireup_info, num_routes, #if OPAL_ENABLE_FT_CR == 1 @@ -133,14 +134,8 @@ static int delete_route(orte_process_name_t *proc) orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (proc->jobid == ORTE_JOBID_INVALID || - proc->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { -#else if (proc->jobid == ORTE_JOBID_INVALID || proc->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -207,14 +202,8 @@ static int update_route(orte_process_name_t *target, orte_routed_jobfam_t *jfam; uint16_t jfamily; 
-#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -275,7 +264,6 @@ static int update_route(orte_process_name_t *target, ORTE_NAME_PRINT(route))); jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); return ORTE_SUCCESS; } } @@ -289,7 +277,6 @@ static int update_route(orte_process_name_t *target, jfam->job_family = jfamily; jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); opal_pointer_array_add(&orte_routed_jobfams, jfam); return ORTE_SUCCESS; } @@ -341,13 +328,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) break; } -#if ORTE_ENABLE_EPOCH - if (0 == ORTE_EPOCH_CMP(target->epoch, ORTE_EPOCH_INVALID) || - 0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { - break; - } -#endif - /* if it is me, then the route is just direct */ if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) { ret = *target; @@ -439,8 +419,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) ret.vpid = debruijn_next_hop (ret.vpid); } while (0); - ORTE_EPOCH_SET(ret.epoch, orte_ess.proc_get_epoch(&ret)); - OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_debruijn_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -451,27 +429,11 @@ static orte_process_name_t get_route(orte_process_name_t *target) } /* HANDLE ACK MESSAGES FROM AN HNP */ -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - static void recv_ack(int status, orte_process_name_t* sender, opal_buffer_t* buffer, 
orte_rml_tag_t tag, void* cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. - * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); + ack_recvd = true; } @@ -641,7 +603,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_debruijn_init_routes: ack recvd", @@ -773,9 +734,10 @@ static int route_lost(const orte_process_name_t *route) if (!orte_finalizing && NULL != lifeline && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) { - opal_output(0, "%s routed:debruijn: Connection to lifeline %s lost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(lifeline)); + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routed:debruijn: Connection to lifeline %s lost", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(lifeline))); return ORTE_ERR_FATAL; } @@ -845,7 +807,6 @@ static int set_lifeline(orte_process_name_t *proc) */ local_lifeline.jobid = proc->jobid; local_lifeline.vpid = proc->vpid; - ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); lifeline = &local_lifeline; return ORTE_SUCCESS; @@ -868,7 +829,7 @@ static unsigned int ilog2 (unsigned int v) return r; } -static int update_routing_tree(orte_jobid_t jobid) +static void update_routing_plan(void) { orte_routed_tree_t *child; opal_list_item_t *item; @@ -879,7 +840,7 @@ static int update_routing_tree(orte_jobid_t 
jobid) * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_ERR_NOT_SUPPORTED; + return; } /* clear the list of children if any are already present */ @@ -920,41 +881,27 @@ static int update_routing_tree(orte_jobid_t jobid) } } } - - ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); - - return ORTE_SUCCESS; } -static orte_vpid_t get_routing_tree(opal_list_t *children) +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll) { - orte_routed_tree_t *child; - opal_list_item_t *item; - orte_routed_tree_t *nm; - /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_VPID_INVALID; + return; } - /* the debruijn routing tree always goes to our children - * for any job - */ - if (NULL != children) { - for (item = opal_list_get_first(&my_children); - item != opal_list_get_end(&my_children); - item = opal_list_get_next(item)) { - child = (orte_routed_tree_t *) item; - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = child->vpid; - opal_list_append(children, &nm->super); - } + if (ORTE_GRPCOMM_XCAST == type) { + orte_routed_base_xcast_routing(coll, &my_children); + } else if (ORTE_GRPCOMM_COLL_RELAY == type) { + orte_routed_base_coll_relay_routing(coll); + } else if (ORTE_GRPCOMM_COLL_COMPLETE == type) { + orte_routed_base_coll_complete_routing(coll); + } else if (ORTE_GRPCOMM_COLL_PEERS == type) { + orte_routed_base_coll_peers(coll, &my_children); } - - /* return my parent's vpid */ - return ORTE_PROC_MY_PARENT->vpid; } static int get_wireup_info(opal_buffer_t *buf) diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index 739072d994..3b9409d407 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -39,8 +39,9 @@ static 
orte_process_name_t get_route(orte_process_name_t *target); static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(orte_jobid_t jobid); -static orte_vpid_t get_routing_tree(opal_list_t *children); +static void update_routing_plan(void); +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); static size_t num_routes(void); @@ -59,8 +60,8 @@ orte_routed_module_t orte_routed_direct_module = { route_lost, route_is_defined, set_lifeline, - update_routing_tree, - get_routing_tree, + update_routing_plan, + get_routing_list, get_wireup_info, num_routes, #if OPAL_ENABLE_FT_CR == 1 @@ -133,19 +134,8 @@ static orte_process_name_t get_route(orte_process_name_t *target) { orte_process_name_t *ret; -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif - ret = ORTE_NAME_INVALID; - goto found; - } - - if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { ret = ORTE_NAME_INVALID; goto found; } @@ -321,30 +311,75 @@ static int set_lifeline(orte_process_name_t *proc) return ORTE_SUCCESS; } -static int update_routing_tree(orte_jobid_t jobid) +static void update_routing_plan(void) { /* nothing to do here */ - return ORTE_SUCCESS; + return; } -static orte_vpid_t get_routing_tree(opal_list_t *children) +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll) { - orte_vpid_t i; - orte_routed_tree_t *nm; + orte_namelist_t *nm; + int32_t i; + orte_job_t *jdata; + orte_proc_t *proc; - if (!ORTE_PROC_IS_HNP) { - /* if I am not the HNP, there 
is nothing to do */ - return ORTE_VPID_INVALID; - } - - /* if I am the HNP, then I need to construct a list containing all - * daemons so I can relay messages to them + /* if I am anything other than daemons and the HNP, this + * is a meaningless command as I am not allowed to route */ - for (i=0; i < orte_process_info.num_procs; i++) { - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = i; + if (!ORTE_PROC_IS_DAEMON || !ORTE_PROC_IS_HNP) { + return; + } + + if (ORTE_GRPCOMM_XCAST == type) { + /* daemons don't route */ + if (ORTE_PROC_IS_DAEMON) { + return; + } + /* HNP sends direct to each daemon */ + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return; + } + for (i=1; i < jdata->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + if( proc->state <= ORTE_PROC_STATE_UNTERMINATED && + NULL != proc->rml_uri ) { + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s get_routing_tree: Adding process %s state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(proc->name)), + orte_proc_state_to_str(proc->state))); + + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc->name.jobid; + nm->name.vpid = proc->name.vpid; + opal_list_append(&coll->targets, &nm->super); + } else { + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s get_routing_tree: Skipped process %15s state %s (non functional daemon)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(proc->name)), + orte_proc_state_to_str(proc->state))); + } + } + } else if (ORTE_GRPCOMM_COLL_RELAY == type) { + orte_routed_base_coll_relay_routing(coll); + } else if (ORTE_GRPCOMM_COLL_COMPLETE == type) { + orte_routed_base_coll_complete_routing(coll); + } else if (ORTE_GRPCOMM_COLL_PEERS == type) { + if (ORTE_PROC_IS_DAEMON) { + return; + } + /* HNP receives from all */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = ORTE_PROC_MY_NAME->jobid; + 
nm->name.vpid = ORTE_VPID_WILDCARD; + opal_list_append(&coll->targets, &nm->super); } - return ORTE_VPID_INVALID; } static int get_wireup_info(opal_buffer_t *buf) diff --git a/orte/mca/routed/linear/routed_linear.c b/orte/mca/routed/linear/routed_linear.c index 49f75428a1..2929c40caf 100644 --- a/orte/mca/routed/linear/routed_linear.c +++ b/orte/mca/routed/linear/routed_linear.c @@ -4,6 +4,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -16,10 +18,10 @@ #include -#include "opal/threads/condition.h" #include "opal/dss/dss.h" #include "opal/class/opal_bitmap.h" #include "opal/class/opal_hash_table.h" +#include "opal/runtime/opal_progress.h" #include "opal/util/output.h" #include "orte/mca/errmgr/errmgr.h" @@ -46,8 +48,9 @@ static orte_process_name_t get_route(orte_process_name_t *target); static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(orte_jobid_t jobid); -static orte_vpid_t get_routing_tree(opal_list_t *children); +static void update_routing_plan(void); +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); static size_t num_routes(void); @@ -66,8 +69,8 @@ orte_routed_module_t orte_routed_linear_module = { route_lost, route_is_defined, set_lifeline, - update_routing_tree, - get_routing_tree, + update_routing_plan, + get_routing_list, get_wireup_info, num_routes, #if OPAL_ENABLE_FT_CR == 1 @@ -78,8 +81,6 @@ orte_routed_module_t orte_routed_linear_module = { }; /* local globals */ -static opal_condition_t cond; -static opal_mutex_t lock; static 
orte_process_name_t *lifeline=NULL; static orte_process_name_t local_lifeline; static bool ack_recvd; @@ -88,10 +89,6 @@ static bool hnp_direct=true; static int init(void) { - /* setup the global condition and lock */ - OBJ_CONSTRUCT(&cond, opal_condition_t); - OBJ_CONSTRUCT(&lock, opal_mutex_t); - ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; lifeline = NULL; @@ -115,10 +112,6 @@ static int finalize(void) } } - /* destruct the global condition and lock */ - OBJ_DESTRUCT(&cond); - OBJ_DESTRUCT(&lock); - lifeline = NULL; return ORTE_SUCCESS; @@ -130,14 +123,8 @@ static int delete_route(orte_process_name_t *proc) orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (proc->jobid == ORTE_JOBID_INVALID || - proc->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { -#else if (proc->jobid == ORTE_JOBID_INVALID || proc->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -204,14 +191,8 @@ static int update_route(orte_process_name_t *target, orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -272,7 +253,6 @@ static int update_route(orte_process_name_t *target, ORTE_NAME_PRINT(route))); jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); return ORTE_SUCCESS; } } @@ -286,7 +266,6 @@ static int update_route(orte_process_name_t *target, jfam->job_family = jfamily; jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,route->epoch); opal_pointer_array_add(&orte_routed_jobfams, jfam); return ORTE_SUCCESS; } @@ -311,19 +290,8 @@ static orte_process_name_t get_route(orte_process_name_t *target) goto found; } 
-#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif - ret = ORTE_NAME_INVALID; - goto found; - } - - if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { ret = ORTE_NAME_INVALID; goto found; } @@ -399,9 +367,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) goto found; } - /* Initialize daemon's epoch, based on its current vpid/jobid */ - ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); - /* if the daemon is me, then send direct to the target! */ if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { ret = target; @@ -421,7 +386,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* we are at end of chain - wrap around */ daemon.vpid = 0; } - ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); ret = &daemon; } } @@ -436,28 +400,11 @@ static orte_process_name_t get_route(orte_process_name_t *target) return *ret; } -/* HANDLE ACK MESSAGES FROM AN HNP */ -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - static void recv_ack(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); + ack_recvd = true; } @@ -627,7 +574,9 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); + while (!ack_recvd) { + opal_progress(); + } OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_linear_init_routes: ack recvd", @@ -728,9 +677,10 @@ static int route_lost(const orte_process_name_t *route) if (!orte_finalizing && NULL != lifeline && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) { - opal_output(0, "%s routed:linear: Connection to lifeline %s lost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(lifeline)); + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routed:linear: Connection to lifeline %s lost", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(lifeline))); return ORTE_ERR_FATAL; } @@ -757,19 +707,18 @@ static int set_lifeline(orte_process_name_t *proc) */ local_lifeline.jobid = proc->jobid; local_lifeline.vpid = proc->vpid; - ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); lifeline = &local_lifeline; return ORTE_SUCCESS; } -static int update_routing_tree(orte_jobid_t jobid) +static void update_routing_plan(void) { /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_ERR_NOT_SUPPORTED; + return; } /* my parent is the my_vpid-1 daemon */ @@ -778,45 +727,50 @@ static int update_routing_tree(orte_jobid_t jobid) } /* nothing to do here as the routing tree is fixed */ - return ORTE_SUCCESS; + return; } -static orte_vpid_t 
get_routing_tree(opal_list_t *children) +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll) { orte_routed_tree_t *nm; - orte_vpid_t v; + opal_list_t my_children; /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_VPID_INVALID; + return; } - /* the linear routing tree consists of a chain of daemons - * extending from the HNP to orte_process_info.num_procs-1. - * Accordingly, my child is just the my_vpid+1 daemon - */ - if (NULL != children && - ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { - /* my child is just the vpid+1 daemon */ - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = ORTE_PROC_MY_NAME->vpid + 1; - opal_bitmap_init(&nm->relatives, orte_process_info.num_procs); - /* my relatives are everyone above that point */ - for (v=nm->vpid+1; v < orte_process_info.num_procs; v++) { - opal_bitmap_set_bit(&nm->relatives, v); + if (ORTE_GRPCOMM_XCAST == type) { + /* the linear routing tree consists of a chain of daemons + * extending from the HNP to orte_process_info.num_procs-1. 
+ * Accordingly, my child is just the my_vpid+1 daemon + */ + OBJ_CONSTRUCT(&my_children, opal_list_t); + if (ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = ORTE_PROC_MY_NAME->vpid + 1; + opal_list_append(&my_children, &nm->super); + } + orte_routed_base_xcast_routing(coll, &my_children); + if (ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { + opal_list_remove_first(&my_children); + OBJ_RELEASE(nm); + } + OBJ_DESTRUCT(&my_children); + } else if (ORTE_GRPCOMM_COLL_RELAY == type) { + orte_routed_base_coll_relay_routing(coll); + } else if (ORTE_GRPCOMM_COLL_COMPLETE == type) { + orte_routed_base_coll_complete_routing(coll); + } else if (ORTE_GRPCOMM_COLL_PEERS == type) { + if (ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = ORTE_PROC_MY_NAME->vpid + 1; + opal_list_append(&coll->targets, &nm->super); } - opal_list_append(children, &nm->super); } - - if (ORTE_PROC_IS_HNP) { - /* the parent of the HNP is invalid */ - return ORTE_VPID_INVALID; - } - - /* my parent is the my_vpid-1 daemon */ - return (ORTE_PROC_MY_NAME->vpid - 1); } diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c index 243a5aa0db..f19d6ea456 100644 --- a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -4,6 +4,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -16,10 +18,10 @@ #include -#include "opal/threads/condition.h" #include "opal/dss/dss.h" #include "opal/class/opal_hash_table.h" #include "opal/class/opal_bitmap.h" +#include "opal/runtime/opal_progress.h" #include "opal/util/output.h" #include "orte/mca/errmgr/errmgr.h" @@ -47,8 +49,9 @@ static orte_process_name_t get_route(orte_process_name_t *target); static int init_routes(orte_jobid_t job, opal_buffer_t *ndat); static int route_lost(const orte_process_name_t *route); static bool route_is_defined(const orte_process_name_t *target); -static int update_routing_tree(orte_jobid_t jobid); -static orte_vpid_t get_routing_tree(opal_list_t *children); +static void update_routing_plan(void); +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); static int get_wireup_info(opal_buffer_t *buf); static int set_lifeline(orte_process_name_t *proc); static size_t num_routes(void); @@ -67,8 +70,8 @@ orte_routed_module_t orte_routed_radix_module = { route_lost, route_is_defined, set_lifeline, - update_routing_tree, - get_routing_tree, + update_routing_plan, + get_routing_list, get_wireup_info, num_routes, #if OPAL_ENABLE_FT_CR == 1 @@ -79,8 +82,6 @@ orte_routed_module_t orte_routed_radix_module = { }; /* local globals */ -static opal_condition_t cond; -static opal_mutex_t lock; static orte_process_name_t *lifeline=NULL; static orte_process_name_t local_lifeline; static int num_children; @@ -90,10 +91,6 @@ static bool hnp_direct=true; static int init(void) { - /* setup the global condition and lock */ - OBJ_CONSTRUCT(&cond, opal_condition_t); - OBJ_CONSTRUCT(&lock, opal_mutex_t); - lifeline = NULL; /* setup the list of children */ @@ -121,10 +118,6 @@ static int finalize(void) } } - /* destruct the global condition and lock */ - OBJ_DESTRUCT(&cond); - OBJ_DESTRUCT(&lock); - lifeline = NULL; /* deconstruct the list of children */ @@ -143,14 +136,8 @@ static int 
delete_route(orte_process_name_t *proc) orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (proc->jobid == ORTE_JOBID_INVALID || - proc->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) { -#else if (proc->jobid == ORTE_JOBID_INVALID || proc->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -217,14 +204,8 @@ static int update_route(orte_process_name_t *target, orte_routed_jobfam_t *jfam; uint16_t jfamily; -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif return ORTE_ERR_BAD_PARAM; } @@ -285,7 +266,6 @@ static int update_route(orte_process_name_t *target, ORTE_NAME_PRINT(route))); jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); return ORTE_SUCCESS; } } @@ -299,7 +279,6 @@ static int update_route(orte_process_name_t *target, jfam->job_family = jfamily; jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; - ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route)); opal_pointer_array_add(&orte_routed_jobfams, jfam); return ORTE_SUCCESS; } @@ -329,21 +308,9 @@ static orte_process_name_t get_route(orte_process_name_t *target) /* initialize */ daemon.jobid = ORTE_PROC_MY_DAEMON->jobid; daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; - ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch); -#if ORTE_ENABLE_EPOCH - if (target->jobid == ORTE_JOBID_INVALID || - target->vpid == ORTE_VPID_INVALID || - 0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) { -#else if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { -#endif - ret = ORTE_NAME_INVALID; - goto found; - } - - if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) { ret = 
ORTE_NAME_INVALID; goto found; } @@ -452,7 +419,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) { /* yep - we need to step through this child */ daemon.vpid = child->vpid; - ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); ret = &daemon; goto found; } @@ -463,7 +429,6 @@ static orte_process_name_t get_route(orte_process_name_t *target) * any of our children, so we have to step up through our parent */ daemon.vpid = ORTE_PROC_MY_PARENT->vpid; - ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon)); ret = &daemon; @@ -477,28 +442,11 @@ found: return *ret; } -/* HANDLE ACK MESSAGES FROM AN HNP */ -static void release_ack(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - ack_recvd = true; - OBJ_RELEASE(mev); -} - static void recv_ack(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack); + ack_recvd = true; } @@ -668,8 +616,10 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); - ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1); - + while (!ack_recvd) { + opal_progress(); + } + OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_radix_init_routes: ack recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -800,9 +750,10 @@ static int route_lost(const orte_process_name_t *route) if (!orte_finalizing && NULL != lifeline && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) { - opal_output(0, "%s routed:radix: Connection to lifeline %s lost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(lifeline)); + OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, + "%s routed:radix: Connection to lifeline %s lost", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(lifeline))); return ORTE_ERR_FATAL; } @@ -872,7 +823,6 @@ static int set_lifeline(orte_process_name_t *proc) */ local_lifeline.jobid = proc->jobid; local_lifeline.vpid = proc->vpid; - ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch); lifeline = &local_lifeline; return ORTE_SUCCESS; @@ -923,7 +873,7 @@ static void radix_tree(int rank, int *num_children, } } -static int update_routing_tree(orte_jobid_t jobid) +static void update_routing_plan(void) { orte_routed_tree_t *child; int j; @@ -935,7 +885,7 @@ static int update_routing_tree(orte_jobid_t jobid) * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_ERR_NOT_SUPPORTED; + return; } /* clear the list of children if any are already present */ @@ -965,7 
+915,6 @@ static int update_routing_tree(orte_jobid_t jobid) ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel; ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel); } - ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT)); /* compute my direct children and the bitmap that shows which vpids * lie underneath their branch @@ -986,39 +935,27 @@ static int update_routing_tree(orte_jobid_t jobid) } } } - - return ORTE_SUCCESS; } -static orte_vpid_t get_routing_tree(opal_list_t *children) +static void get_routing_list(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll) { - opal_list_item_t *item; - orte_routed_tree_t *child; - orte_routed_tree_t *nm; - /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { - return ORTE_VPID_INVALID; + return; } - /* the radix routing tree always goes to our children, - * for any job - */ - if (NULL != children) { - for (item = opal_list_get_first(&my_children); - item != opal_list_get_end(&my_children); - item = opal_list_get_next(item)) { - child = (orte_routed_tree_t*)item; - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = child->vpid; - opal_bitmap_copy(&nm->relatives, &child->relatives); - opal_list_append(children, &nm->super); - } + if (ORTE_GRPCOMM_XCAST == type) { + orte_routed_base_xcast_routing(coll, &my_children); + } else if (ORTE_GRPCOMM_COLL_RELAY == type) { + orte_routed_base_coll_relay_routing(coll); + } else if (ORTE_GRPCOMM_COLL_COMPLETE == type) { + orte_routed_base_coll_complete_routing(coll); + } else if (ORTE_GRPCOMM_COLL_PEERS == type) { + orte_routed_base_coll_peers(coll, &my_children); } - /* return my parent's vpid */ - return ORTE_PROC_MY_PARENT->vpid; } static int get_wireup_info(opal_buffer_t *buf) diff --git a/orte/mca/routed/routed.h b/orte/mca/routed/routed.h index cba46798b4..99d5d2fead 100644 --- a/orte/mca/routed/routed.h +++ b/orte/mca/routed/routed.h 
@@ -40,6 +40,8 @@ #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" +#include "orte/mca/grpcomm/grpcomm_types.h" + #include "orte/mca/routed/routed_types.h" BEGIN_C_DECLS @@ -188,29 +190,23 @@ typedef bool (*orte_routed_module_route_is_defined_fn_t)(const orte_process_name typedef int (*orte_routed_module_get_wireup_info_fn_t)(opal_buffer_t *buf); /* - * Update the module's routing tree for this process + * Update the module's routing plan * - * Called only by a daemon and the HNP, this function creates a list - * of "leaves" for this process and identifies the vpid of the parent - * sitting above this process in the tree. - * - * @param [in] jobid The jobid of the routing tree that needs to be updated. - * - * @retval ORTE_SUCCESS The operation completed successfully - * @retval ORTE_ERROR_xxx The specifed error occurred + * Called only by a daemon and the HNP, this function creates a plan + * for routing messages within ORTE, especially for routing collectives + * used during wireup */ -typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid); +typedef void (*orte_routed_module_update_routing_plan_fn_t)(void); /* - * Get the routing tree for this process + * Get the routing list for the specified collective * - * Fills the provided list with the direct children of this process - * in the routing tree, and returns the vpid of the parent. Only valid - * when called by a daemon or the HNP. Passing a NULL pointer will result - * in only the parent vpid being returned. The returned list will be filled - * with orte_routed_tree_t items. 
+ * Fills the target list with names for the given collective so that + * the grpcomm framework will know who to send the collective to + * next */ -typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(opal_list_t *children); +typedef void (*orte_routed_module_get_routing_list_fn_t)(orte_grpcomm_coll_t type, + orte_grpcomm_collective_t *coll); /* * Set lifeline process @@ -261,8 +257,8 @@ struct orte_routed_module_t { orte_routed_module_route_is_defined_fn_t route_is_defined; orte_routed_module_set_lifeline_fn_t set_lifeline; /* fns for daemons */ - orte_routed_module_update_routing_tree_fn_t update_routing_tree; - orte_routed_module_get_routing_tree_fn_t get_routing_tree; + orte_routed_module_update_routing_plan_fn_t update_routing_plan; + orte_routed_module_get_routing_list_fn_t get_routing_list; orte_routed_module_get_wireup_info_fn_t get_wireup_info; orte_routed_module_num_routes_fn_t num_routes; /* FT Notification */ diff --git a/orte/mca/routed/routed_types.h b/orte/mca/routed/routed_types.h index 161d894a8b..9eed54abad 100644 --- a/orte/mca/routed/routed_types.h +++ b/orte/mca/routed/routed_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Los Alamos National Security, LLC. + * Copyright (c) 2008-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2004-2008 The Trustees of Indiana University. * All rights reserved. diff --git a/orte/mca/sensor/file/sensor_file.c b/orte/mca/sensor/file/sensor_file.c index 3193c4f67d..27917a682c 100644 --- a/orte/mca/sensor/file/sensor_file.c +++ b/orte/mca/sensor/file/sensor_file.c @@ -3,6 +3,8 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* * $COPYRIGHT$ * @@ -43,6 +45,7 @@ #include "orte/util/show_help.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_wait.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -70,9 +73,6 @@ typedef struct { opal_list_item_t super; orte_jobid_t jobid; orte_vpid_t vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; -#endif char *file; int tick; bool check_size; @@ -138,8 +138,7 @@ static void finalize(void) static void start(orte_jobid_t jobid) { mca_base_component_t *c = &mca_sensor_file_component.super.base_version; - opal_list_item_t *item; - orte_odls_job_t *jobdat; + orte_job_t *jobdat; orte_app_context_t *app, *aptr; int rc, tmp; char *filename; @@ -156,87 +155,85 @@ static void start(orte_jobid_t jobid) ORTE_JOBID_PRINT(jobid))); /* get the local jobdat for this job */ - for (item = opal_list_get_first(&orte_local_jobdata); - item != opal_list_get_end(&orte_local_jobdata); - item = opal_list_get_end(&orte_local_jobdata)) { - jobdat = (orte_odls_job_t*)item; - if (jobid == jobdat->jobid || ORTE_JOBID_WILDCARD == jobid) { - /* must be at least one app_context, so use the first one found */ - app = NULL; - for (tmp=0; tmp < jobdat->apps.size; tmp++) { - if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, tmp))) { - app = aptr; - break; - } - } - if (NULL == app) { - /* got a problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - continue; - } - - /* search the environ to get the filename */ - if (ORTE_SUCCESS != (rc = mca_base_param_find_string(c, "filename", app->env, &filename))) { - /* was a default file given */ - if (NULL == mca_sensor_file_component.file) { - /* can't do anything without a file */ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s sensor:file no file for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - continue; - } - filename = mca_sensor_file_component.file; - } - - 
/* create the tracking object */ - ft = OBJ_NEW(file_tracker_t); - ft->jobid = jobid; - ft->file = strdup(filename); - - /* search the environ to see what we are checking */ - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_size", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_size) { - ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size); - } - } else { - ft->check_size = OPAL_INT_TO_BOOL(tmp); - } - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_access", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_access) { - ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access); - } - } else { - ft->check_access = OPAL_INT_TO_BOOL(tmp); - } - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_mod", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_mod) { - ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod); - } - } else { - ft->check_mod = OPAL_INT_TO_BOOL(tmp); - } - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "limit", app->env, &tmp))) { - ft->limit = mca_sensor_file_component.limit; - } else { - ft->limit = tmp; - } - opal_list_append(&jobs, &ft->super); - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s file %s monitored for %s%s%s with limit %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ft->file, ft->check_size ? "SIZE:" : " ", - ft->check_access ? "ACCESS TIME:" : " ", - ft->check_mod ? 
"MOD TIME" : " ", ft->limit)); + if (NULL == (jobdat = orte_get_job_data_object(jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return; + } + + /* must be at least one app_context, so use the first one found */ + app = NULL; + for (tmp=0; tmp < jobdat->apps->size; tmp++) { + if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, tmp))) { + app = aptr; + break; } } + if (NULL == app) { + /* got a problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return; + } + + /* search the environ to get the filename */ + if (ORTE_SUCCESS != (rc = mca_base_param_find_string(c, "filename", app->env, &filename))) { + /* was a default file given */ + if (NULL == mca_sensor_file_component.file) { + /* can't do anything without a file */ + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s sensor:file no file for job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobid))); + return; + } + filename = mca_sensor_file_component.file; + } + + /* create the tracking object */ + ft = OBJ_NEW(file_tracker_t); + ft->jobid = jobid; + ft->file = strdup(filename); + + /* search the environ to see what we are checking */ + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_size", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_file_component.check_size) { + ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size); + } + } else { + ft->check_size = OPAL_INT_TO_BOOL(tmp); + } + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_access", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_file_component.check_access) { + ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access); + } + } else { + ft->check_access = OPAL_INT_TO_BOOL(tmp); + } + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_mod", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_file_component.check_mod) { + 
ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod); + } + } else { + ft->check_mod = OPAL_INT_TO_BOOL(tmp); + } + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "limit", app->env, &tmp))) { + ft->limit = mca_sensor_file_component.limit; + } else { + ft->limit = tmp; + } + opal_list_append(&jobs, &ft->super); + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s file %s monitored for %s%s%s with limit %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ft->file, ft->check_size ? "SIZE:" : " ", + ft->check_access ? "ACCESS TIME:" : " ", + ft->check_mod ? "MOD TIME" : " ", ft->limit)); /* start sampling */ if (NULL == sample_ev && !opal_list_is_empty(&jobs)) { @@ -244,7 +241,7 @@ static void start(orte_jobid_t jobid) * for a data sample */ sample_ev = (opal_event_t *) malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, sample_ev, sample, sample_ev); + opal_event_evtimer_set(orte_event_base, sample_ev, sample, sample_ev); sample_time.tv_sec = mca_sensor_file_component.sample_rate; sample_time.tv_usec = 0; opal_event_evtimer_add(sample_ev, &sample_time); @@ -286,6 +283,7 @@ static void sample(int fd, short event, void *arg) struct stat buf; opal_list_item_t *item; file_tracker_t *ft; + orte_job_t *jdata; /* if we are not sampling any more, then just return */ if (NULL == sample_ev) { @@ -353,9 +351,8 @@ static void sample(int fd, short event, void *arg) if (ft->tick == ft->limit) { orte_show_help("help-orte-sensor-file.txt", "file-stalled", true, ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod)); - orte_errmgr.update_state(ft->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED, - NULL, ORTE_PROC_STATE_UNDEF, - 0, ORTE_ERR_PROC_STALLED); + jdata = orte_get_job_data_object(ft->jobid); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED); } } diff --git a/orte/mca/sensor/ft_tester/sensor_ft_tester.c b/orte/mca/sensor/ft_tester/sensor_ft_tester.c index 3fb437ef3e..e7d957a232 100644 --- 
a/orte/mca/sensor/ft_tester/sensor_ft_tester.c +++ b/orte/mca/sensor/ft_tester/sensor_ft_tester.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * * $COPYRIGHT$ * @@ -89,7 +91,7 @@ static void start(orte_jobid_t jobid) if (NULL == sample_ev) { /* startup a timer to wake us up periodically */ sample_ev = (opal_event_t *) malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, sample_ev, sample, sample_ev); + opal_event_evtimer_set(orte_event_base, sample_ev, sample, sample_ev); sample_time.tv_sec = mca_sensor_ft_tester_component.fail_rate; sample_time.tv_usec = 0; opal_event_evtimer_add(sample_ev, &sample_time); @@ -111,8 +113,8 @@ static void stop(orte_jobid_t jobid) static void sample(int fd, short event, void *arg) { float prob; - opal_list_item_t *item; - orte_odls_child_t *child; + orte_proc_t *child; + int i; /* if we are not sampling any more, then just return */ if (NULL == sample_ev) { @@ -141,17 +143,16 @@ static void sample(int fd, short event, void *arg) } /* see if we should kill a child */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (!child->alive || 0 == child->pid || ORTE_PROC_STATE_UNTERMINATED < child->state) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), child->alive ? 
"TRUE" : "FALSE", (unsigned long)child->pid, orte_proc_state_to_str(child->state))); continue; @@ -161,26 +162,21 @@ static void sample(int fd, short event, void *arg) OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester child: %s dice: %f prob %f", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), + ORTE_NAME_PRINT(&child->name), prob, mca_sensor_ft_tester_component.fail_prob)); if (prob < mca_sensor_ft_tester_component.fail_prob) { /* you shall die... */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester killing %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name))); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + ORTE_NAME_PRINT(&child->name))); kill(child->pid, SIGTERM); - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); /* are we allowing multiple deaths */ if (!mca_sensor_ft_tester_component.multi_fail) { break; } } } - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); /* restart the timer */ if (NULL != sample_ev) { diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c index 81d331782b..29b4148c86 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.c +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -27,13 +29,12 @@ #include "opal/mca/pstat/pstat.h" #include "opal/mca/event/event.h" -#include "orte/threads/threads.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" #include "orte/util/name_fns.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/base/odls_private.h" -#include "orte/mca/rmcast/rmcast.h" +#include "orte/mca/rml/rml.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" @@ -59,27 +60,14 @@ orte_sensor_base_module_t orte_sensor_heartbeat_module = { static void read_stats(int fd, short event, void *arg); static void check_heartbeat(int fd, short event, void *arg); static void send_heartbeat(int fd, short event, void *arg); -static void recv_beats(int status, - orte_rmcast_channel_t channel, - orte_rmcast_seq_t seq_num, - orte_rmcast_tag_t tag, - orte_process_name_t *sender, - opal_buffer_t *buf, void* cbdata); -static void cbfunc(int status, - orte_rmcast_channel_t channel, - orte_rmcast_seq_t seq_num, - orte_rmcast_tag_t tag, - orte_process_name_t *sender, - opal_buffer_t *buf, void* cbdata) -{ - OBJ_RELEASE(buf); -} +static void recv_beats(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata); /* local globals */ static opal_event_t *send_ev = NULL, *check_ev = NULL; static struct timeval send_time, check_time; static orte_job_t *daemons=NULL; -static orte_thread_ctl_t ctl; static bool already_started=false; static bool use_collected=false; @@ -91,7 +79,6 @@ static int init(void) "%s initializing heartbeat recvs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - OBJ_CONSTRUCT(&ctl, orte_thread_ctl_t); already_started = false; /* check if resource usage is being sampled elsewhere */ @@ -116,19 +103,18 @@ static int init(void) if (NULL == (orte_sensor_base.my_node = orte_sensor_base.my_proc->node)) { return ORTE_ERR_NOT_FOUND; } - /* protect the objects */ - OBJ_RETAIN(orte_sensor_base.my_proc); - 
OBJ_RETAIN(orte_sensor_base.my_node); } } + /* protect the objects */ + OBJ_RETAIN(orte_sensor_base.my_proc); + OBJ_RETAIN(orte_sensor_base.my_node); /* setup to receive heartbeats */ if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_SCHEDULER) { - if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_HEARTBEAT_CHANNEL, - ORTE_RMCAST_TAG_HEARTBEAT, - ORTE_RMCAST_PERSISTENT, - recv_beats, - NULL))) { + if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + ORTE_RML_TAG_HEARTBEAT, + ORTE_RML_PERSISTENT, + recv_beats, NULL))) { ORTE_ERROR_LOG(rc); } } @@ -149,7 +135,7 @@ static void finalize(void) check_ev = NULL; } - orte_rmcast.cancel_recv(ORTE_RMCAST_HEARTBEAT_CHANNEL, ORTE_RMCAST_TAG_HEARTBEAT); + orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT); OBJ_RELEASE(orte_sensor_base.my_proc); OBJ_RELEASE(orte_sensor_base.my_node); @@ -208,7 +194,7 @@ static void start(orte_jobid_t jobid) } /* setup the send */ send_ev = (opal_event_t*)malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, send_ev, send_heartbeat, send_ev); + opal_event_evtimer_set(orte_event_base, send_ev, send_heartbeat, send_ev); opal_event_evtimer_add(send_ev, &send_time); } else if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_SCHEDULER) { @@ -220,7 +206,7 @@ static void start(orte_jobid_t jobid) /* setup the check */ check_ev = (opal_event_t*)malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, check_ev, check_heartbeat, check_ev); + opal_event_evtimer_set(orte_event_base, check_ev, check_heartbeat, check_ev); opal_event_evtimer_add(check_ev, &check_time); /* if we want stats, then we'll setup our own timer @@ -235,7 +221,7 @@ static void start(orte_jobid_t jobid) return; } send_ev = (opal_event_t*)malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, send_ev, read_stats, send_ev); + opal_event_evtimer_set(orte_event_base, send_ev, read_stats, send_ev); opal_event_evtimer_add(send_ev, &send_time); } } @@ -266,8 +252,6 @@ static 
void read_stats(int fd, short event, void *arg) opal_pstats_t *stats, *st; opal_node_stats_t *nstats, *ndstats; - ORTE_ACQUIRE_THREAD(&ctl); - if (use_collected) { /* nothing for us to do - already have the data */ goto reset; @@ -295,7 +279,6 @@ static void read_stats(int fd, short event, void *arg) } reset: - ORTE_RELEASE_THREAD(&ctl); /* reset the timer */ opal_event_evtimer_add(tmp, &send_time); @@ -305,9 +288,8 @@ static void send_heartbeat(int fd, short event, void *arg) { opal_buffer_t *buf; opal_event_t *tmp = (opal_event_t*)arg; - int rc; - opal_list_item_t *item; - orte_odls_child_t *child; + int rc, i; + orte_proc_t *child; opal_pstats_t *st; opal_node_stats_t *nst; @@ -316,8 +298,6 @@ static void send_heartbeat(int fd, short event, void *arg) return; } - ORTE_ACQUIRE_THREAD(&ctl); - /* if my HNP hasn't been defined yet, ignore - nobody listening yet */ if (ORTE_JOBID_INVALID == ORTE_PROC_MY_HNP->jobid || ORTE_VPID_INVALID == ORTE_PROC_MY_HNP->vpid) { @@ -377,11 +357,10 @@ static void send_heartbeat(int fd, short event, void *arg) OBJ_RELEASE(st); OBJ_RELEASE(nst); /* add data for my children */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (!child->alive) { continue; } @@ -406,9 +385,9 @@ static void send_heartbeat(int fd, short event, void *arg) * in here */ strncpy(st->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN); - st->rank = child->name->vpid; + st->rank = child->name.vpid; } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, child->name, 1, ORTE_NAME))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(st); continue; @@ -420,22 +399,18 @@ 
static void send_heartbeat(int fd, short event, void *arg) } OBJ_RELEASE(st); } - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); } BEAT: /* send heartbeat */ - if (0 > (rc = orte_rmcast.send_buffer_nb(ORTE_RMCAST_HEARTBEAT_CHANNEL, - ORTE_RMCAST_TAG_HEARTBEAT, buf, - cbfunc, NULL))) { + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, + ORTE_RML_TAG_HEARTBEAT, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); } reset: - ORTE_RELEASE_THREAD(&ctl); - /* reset the timer */ opal_event_evtimer_add(tmp, &send_time); } @@ -450,8 +425,6 @@ static void check_heartbeat(int fd, short dummy, void *arg) orte_proc_t *proc; opal_event_t *tmp = (opal_event_t*)arg; - ORTE_ACQUIRE_THREAD(&ctl); - OPAL_OUTPUT_VERBOSE((3, orte_sensor_base.output, "%s sensor:check_heartbeat", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -489,9 +462,7 @@ static void check_heartbeat(int fd, short dummy, void *arg) "%s sensor:check_heartbeat FAILED for daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); - orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED, - &proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED, - 0, ORTE_ERR_HEARTBEAT_LOST); + orte_errmgr.update_proc_state(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED); } else { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d", @@ -503,18 +474,13 @@ static void check_heartbeat(int fd, short dummy, void *arg) } reset: - ORTE_RELEASE_THREAD(&ctl); - /* reset the timer */ opal_event_evtimer_add(tmp, &check_time); } -static void recv_beats(int status, - orte_rmcast_channel_t channel, - orte_rmcast_seq_t seq_num, - orte_rmcast_tag_t tag, - orte_process_name_t *sender, - opal_buffer_t *buf, void* cbdata) +static void recv_beats(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata) { orte_job_t *jdata; orte_proc_t *proc; @@ -528,8 
+494,6 @@ static void recv_beats(int status, return; } - ORTE_ACQUIRE_THREAD(&ctl); - /* get this daemon's object */ if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, @@ -550,7 +514,7 @@ static void recv_beats(int status, ORTE_ERROR_LOG(rc); /* turn off the stats */ mca_sensor_heartbeat_component.include_stats = false; - goto DEPART; + return; } /* store the node stats */ if (NULL != proc->node) { @@ -566,7 +530,7 @@ static void recv_beats(int status, ORTE_ERROR_LOG(rc); /* turn off the stats */ mca_sensor_heartbeat_component.include_stats = false; - goto DEPART; + return; } /* store this data */ if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&proc->stats, stats))) { @@ -601,7 +565,4 @@ static void recv_beats(int status, ORTE_ERROR_LOG(rc); } } - - DEPART: - ORTE_RELEASE_THREAD(&ctl); } diff --git a/orte/mca/sensor/resusage/sensor_resusage.c b/orte/mca/sensor/resusage/sensor_resusage.c index 2870198456..bf386aadee 100644 --- a/orte/mca/sensor/resusage/sensor_resusage.c +++ b/orte/mca/sensor/resusage/sensor_resusage.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. - * + * Copyright (c) 2011 Los Alamos National Security, LLC. All rights + * reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,6 +36,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/odls/base/odls_private.h" +#include "orte/mca/state/state.h" #include "orte/runtime/orte_globals.h" #include "orte/orted/orted.h" @@ -64,7 +67,6 @@ static void sample(int fd, short event, void *arg); /* local globals */ static opal_event_t *sample_ev = NULL; static struct timeval sample_time; -static bool created = false; static int init(void) { @@ -79,7 +81,6 @@ static int init(void) if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { orte_sensor_base.my_proc = OBJ_NEW(orte_proc_t); orte_sensor_base.my_node = OBJ_NEW(orte_node_t); - created = true; } else { if (NULL == (orte_sensor_base.my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid))) { return ORTE_ERR_NOT_FOUND; @@ -87,7 +88,9 @@ static int init(void) if (NULL == (orte_sensor_base.my_node = orte_sensor_base.my_proc->node)) { return ORTE_ERR_NOT_FOUND; } - created = false; + /* protect the objects */ + OBJ_RETAIN(orte_sensor_base.my_proc); + OBJ_RETAIN(orte_sensor_base.my_node); } return ORTE_SUCCESS; @@ -101,10 +104,8 @@ static void finalize(void) sample_ev = NULL; } - if (created) { - OBJ_RELEASE(orte_sensor_base.my_proc); - OBJ_RELEASE(orte_sensor_base.my_node); - } + OBJ_RELEASE(orte_sensor_base.my_proc); + OBJ_RELEASE(orte_sensor_base.my_node); return; } @@ -119,7 +120,7 @@ static void start(orte_jobid_t jobid) * for a data sample */ sample_ev = (opal_event_t *) malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, sample_ev, sample, sample_ev); + opal_event_evtimer_set(orte_event_base, sample_ev, sample, sample_ev); sample_time.tv_sec = mca_sensor_resusage_component.sample_rate; sample_time.tv_usec = 0; opal_event_evtimer_add(sample_ev, &sample_time); @@ -142,9 +143,8 @@ static void sample(int fd, short event, void *arg) { opal_pstats_t *stats, *st; 
opal_node_stats_t *nstats, *nst; - int rc; - opal_list_item_t *item; - orte_odls_child_t *child, *hog=NULL; + int rc, i; + orte_proc_t *child, *hog=NULL; float in_use, max_mem; /* if we are not sampling any more, then just return */ @@ -176,11 +176,10 @@ static void sample(int fd, short event, void *arg) } /* loop through our children and update their stats */ - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (!child->alive) { continue; } @@ -196,7 +195,7 @@ static void sample(int fd, short event, void *arg) } /* the stats framework can't know nodename or rank */ strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN); - stats->rank = child->name->vpid; + stats->rank = child->name.vpid; /* store it */ if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) { OBJ_RELEASE(st); @@ -210,7 +209,7 @@ static void sample(int fd, short event, void *arg) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* compute the percentage of node memory in-use */ if (NULL == (nst = (opal_node_stats_t*)opal_ring_buffer_poke(&orte_sensor_base.my_node->stats, -1))) { - goto RELEASE; + goto RESTART; } in_use = 1.0 - (nst->free_mem / nst->total_mem); OPAL_OUTPUT_VERBOSE((2, orte_sensor_base.output, @@ -221,10 +220,10 @@ static void sample(int fd, short event, void *arg) /* loop through our children and find the biggest hog */ hog = NULL; max_mem = 0.0; - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = 
(orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } if (!child->alive) { continue; } @@ -238,7 +237,7 @@ static void sample(int fd, short event, void *arg) OPAL_OUTPUT_VERBOSE((5, orte_sensor_base.output, "%s PROC %s AT VSIZE %f", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), st->vsize)); + ORTE_NAME_PRINT(&child->name), st->vsize)); if (max_mem < st->vsize) { hog = child; max_mem = st->vsize; @@ -251,8 +250,6 @@ static void sample(int fd, short event, void *arg) OPAL_OUTPUT_VERBOSE((2, orte_sensor_base.output, "%s NO CHILD: COMMITTING SUICIDE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL); } else { /* report the problem - this will normally kill the proc, so @@ -261,12 +258,8 @@ static void sample(int fd, short event, void *arg) OPAL_OUTPUT_VERBOSE((2, orte_sensor_base.output, "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(hog->name))); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - orte_errmgr.update_state(hog->name->jobid, ORTE_JOB_STATE_UNDEF, - hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - hog->pid, ORTE_ERR_MEM_LIMIT_EXCEEDED); + ORTE_NAME_PRINT(&hog->name))); + ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); } goto RESTART; } @@ -278,10 +271,10 @@ static void sample(int fd, short event, void *arg) "%s CHECKING PROC MEM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* check my children first */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + 
} if (!child->alive) { continue; } @@ -295,25 +288,16 @@ static void sample(int fd, short event, void *arg) OPAL_OUTPUT_VERBOSE((5, orte_sensor_base.output, "%s PROC %s AT VSIZE %f", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), st->vsize)); + ORTE_NAME_PRINT(&child->name), st->vsize)); if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) { /* report the problem - this will normally kill the proc, so * we have to release the ODLS thread first */ - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - orte_errmgr.update_state(child->name->jobid, ORTE_JOB_STATE_UNDEF, - child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - child->pid, ORTE_ERR_MEM_LIMIT_EXCEEDED); - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); } } } - RELEASE: - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); - RESTART: /* restart the timer */ if (NULL != sample_ev) { diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 35566af111..3efd5dc620 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -80,7 +80,6 @@ void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t * { snapshot->process_name.jobid = 0; snapshot->process_name.vpid = 0; - ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; @@ -91,7 +90,6 @@ void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t * { snapshot->process_name.jobid = 0; snapshot->process_name.vpid = 0; - ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c index f76fd83cfe..a5ee2c4475 100644 --- a/orte/mca/snapc/full/snapc_full_global.c +++ 
b/orte/mca/snapc/full/snapc_full_global.c @@ -429,7 +429,6 @@ int global_coord_start_ckpt(orte_snapc_base_quiesce_t *datum) new_proc = OBJ_NEW(orte_proc_t); new_proc->name.jobid = proc->name.jobid; new_proc->name.vpid = proc->name.vpid; - ORTE_EPOCH_SET(new_proc->name.epoch,proc->name.epoch); new_proc->node = OBJ_NEW(orte_node_t); new_proc->node->name = proc->node->name; opal_list_append(migrating_procs, &new_proc->super); @@ -648,7 +647,6 @@ static int global_init_job_structs(void) orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; - ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); mask = ORTE_NS_CMP_JOBID; @@ -666,7 +664,6 @@ static int global_init_job_structs(void) app_snapshot->process_name.jobid = procs[p]->name.jobid; app_snapshot->process_name.vpid = procs[p]->name.vpid; - ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super)); } @@ -830,7 +827,6 @@ static int global_refresh_job_structs(void) app_snapshot->process_name.jobid = procs[p]->name.jobid; app_snapshot->process_name.vpid = procs[p]->name.vpid; - ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super)); } @@ -846,7 +842,6 @@ static int global_refresh_job_structs(void) orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid; orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid; - ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch); mask = ORTE_NS_CMP_ALL; @@ -867,7 +862,6 @@ static int global_refresh_job_structs(void) app_snapshot->process_name.jobid = procs[p]->name.jobid; app_snapshot->process_name.vpid = procs[p]->name.vpid; - ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch); 
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super)); } diff --git a/orte/mca/snapc/full/snapc_full_local.c b/orte/mca/snapc/full/snapc_full_local.c index c608e87a45..558f8dbafb 100644 --- a/orte/mca/snapc/full/snapc_full_local.c +++ b/orte/mca/snapc/full/snapc_full_local.c @@ -1471,7 +1471,7 @@ static int snapc_full_local_start_ckpt_open_comm(orte_snapc_full_app_snapshot_t s_time/usleep_time, max_wait_time/usleep_time)); } usleep(usleep_time); - opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK); + opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK); continue; } else if( 0 > (ret = access(vpid_snapshot->comm_pipe_w, F_OK) )) { @@ -1483,7 +1483,7 @@ static int snapc_full_local_start_ckpt_open_comm(orte_snapc_full_app_snapshot_t s_time/usleep_time, max_wait_time/usleep_time)); } usleep(usleep_time); - opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK); + opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK); continue; } else { @@ -1712,7 +1712,7 @@ static int snapc_full_local_start_ckpt_handshake(orte_snapc_full_app_snapshot_t goto cleanup; } - opal_event_set(opal_event_base, &(vpid_snapshot->comm_pipe_r_eh), + opal_event_set(orte_event_base, &(vpid_snapshot->comm_pipe_r_eh), vpid_snapshot->comm_pipe_r_fd, OPAL_EV_READ|OPAL_EV_PERSIST, snapc_full_local_comm_read_event, @@ -2033,7 +2033,6 @@ static int snapc_full_local_get_vpids(void) vpid_snapshot->process_pid = child->pid; vpid_snapshot->super.process_name.jobid = child->name->jobid; vpid_snapshot->super.process_name.vpid = child->name->vpid; - ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); } } @@ -2095,7 +2094,6 @@ static int snapc_full_local_refresh_vpids(void) vpid_snapshot->process_pid = child->pid; vpid_snapshot->super.process_name.jobid = child->name->jobid; vpid_snapshot->super.process_name.vpid = child->name->vpid; - ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); /*vpid_snapshot->migrating = true;*/ 
opal_list_append(&(local_global_snapshot.local_snapshots), &(vpid_snapshot->super.super)); @@ -2111,7 +2109,6 @@ static int snapc_full_local_refresh_vpids(void) vpid_snapshot->process_pid = child->pid; vpid_snapshot->super.process_name.jobid = child->name->jobid; vpid_snapshot->super.process_name.vpid = child->name->vpid; - ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch); } } diff --git a/orte/mca/snapc/full/snapc_full_module.c b/orte/mca/snapc/full/snapc_full_module.c index aee4d113a5..3b03cad003 100644 --- a/orte/mca/snapc/full/snapc_full_module.c +++ b/orte/mca/snapc/full/snapc_full_module.c @@ -83,7 +83,6 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t, void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) { snapshot->process_name.jobid = 0; snapshot->process_name.vpid = 0; - ORTE_EPOCH_SET(snapshot->process_name.epoch,0); snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; } @@ -91,7 +90,6 @@ void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) { snapshot->process_name.jobid = 0; snapshot->process_name.vpid = 0; - ORTE_EPOCH_SET(snapshot->process_name.epoch,0); snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE; } diff --git a/orte/mca/sstore/base/sstore_base_fns.c b/orte/mca/sstore/base/sstore_base_fns.c index 8fbf6214d0..c53572532a 100644 --- a/orte/mca/sstore/base/sstore_base_fns.c +++ b/orte/mca/sstore/base/sstore_base_fns.c @@ -62,7 +62,6 @@ void orte_sstore_base_local_snapshot_info_construct(orte_sstore_base_local_snaps { snapshot->process_name.jobid = 0; snapshot->process_name.vpid = 0; - ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); snapshot->crs_comp = NULL; snapshot->compress_comp = NULL; @@ -76,7 +75,6 @@ void orte_sstore_base_local_snapshot_info_destruct( orte_sstore_base_local_snaps { snapshot->process_name.jobid = 0; snapshot->process_name.vpid = 0; - 
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN); if( NULL != snapshot->crs_comp ) { free(snapshot->crs_comp); @@ -637,7 +635,6 @@ int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_in vpid_snapshot->process_name.jobid = proc.jobid; vpid_snapshot->process_name.vpid = proc.vpid; - ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch); } else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) { vpid_snapshot->crs_comp = strdup(value); diff --git a/orte/mca/sstore/central/sstore_central_global.c b/orte/mca/sstore/central/sstore_central_global.c index b4b1457468..ff58ba7550 100644 --- a/orte/mca/sstore/central/sstore_central_global.c +++ b/orte/mca/sstore/central/sstore_central_global.c @@ -1216,7 +1216,6 @@ static int orte_sstore_central_extract_global_metadata(orte_sstore_central_globa vpid_snapshot->process_name.jobid = handle_info->jobid; vpid_snapshot->process_name.vpid = i; - ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); vpid_snapshot->crs_comp = NULL; global_snapshot->start_time = NULL; diff --git a/orte/mca/sstore/central/sstore_central_local.c b/orte/mca/sstore/central/sstore_central_local.c index 5647f75124..282ab7677e 100644 --- a/orte/mca/sstore/central/sstore_central_local.c +++ b/orte/mca/sstore/central/sstore_central_local.c @@ -210,7 +210,6 @@ void orte_sstore_central_local_app_snapshot_info_construct(orte_sstore_central_l { info->name.jobid = ORTE_JOBID_INVALID; info->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); info->local_location = NULL; info->metadata_filename = NULL; @@ -222,7 +221,6 @@ void orte_sstore_central_local_app_snapshot_info_destruct( orte_sstore_central_l { info->name.jobid = ORTE_JOBID_INVALID; info->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); if( NULL != info->local_location ) { free(info->local_location); 
@@ -535,7 +533,6 @@ static int append_new_app_handle_info(orte_sstore_central_local_snapshot_info_t app_info->name.jobid = name->jobid; app_info->name.vpid = name->vpid; - ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); opal_list_append(handle_info->app_info_handle, &(app_info->super)); diff --git a/orte/mca/sstore/stage/sstore_stage_global.c b/orte/mca/sstore/stage/sstore_stage_global.c index 4d88284d72..39d9c99e9a 100644 --- a/orte/mca/sstore/stage/sstore_stage_global.c +++ b/orte/mca/sstore/stage/sstore_stage_global.c @@ -1218,10 +1218,8 @@ static int process_local_push(orte_process_name_t* peer, opal_buffer_t* buffer, p_set = OBJ_NEW(orte_filem_base_process_set_t); p_set->source.jobid = peer->jobid; p_set->source.vpid = peer->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,peer->epoch); p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); opal_list_append(&(filem_request->process_sets), &(p_set->super) ); } @@ -1706,7 +1704,6 @@ static int orte_sstore_stage_extract_global_metadata(orte_sstore_stage_global_sn vpid_snapshot->process_name.jobid = handle_info->jobid; vpid_snapshot->process_name.vpid = i; - ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name)); /* JJH: Currently we do not have this information since we do not save * individual vpid info in the Global SStore. 
It is in the metadata diff --git a/orte/mca/sstore/stage/sstore_stage_local.c b/orte/mca/sstore/stage/sstore_stage_local.c index ba1e4b6b70..6de4479450 100644 --- a/orte/mca/sstore/stage/sstore_stage_local.c +++ b/orte/mca/sstore/stage/sstore_stage_local.c @@ -287,7 +287,6 @@ void orte_sstore_stage_local_app_snapshot_info_construct(orte_sstore_stage_local { info->name.jobid = ORTE_JOBID_INVALID; info->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); info->local_location = NULL; info->compressed_local_location = NULL; @@ -302,7 +301,6 @@ void orte_sstore_stage_local_app_snapshot_info_destruct( orte_sstore_stage_local { info->name.jobid = ORTE_JOBID_INVALID; info->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN); if( NULL != info->local_location ) { free(info->local_location); @@ -1014,7 +1012,6 @@ static int append_new_app_handle_info(orte_sstore_stage_local_snapshot_info_t *h app_info->name.jobid = name->jobid; app_info->name.vpid = name->vpid; - ORTE_EPOCH_SET(app_info->name.epoch,name->epoch); opal_list_append(handle_info->app_info_handle, &(app_info->super)); @@ -2057,17 +2054,14 @@ static int orte_sstore_stage_local_preload_files(char **local_location, bool *sk /* if I am the HNP, then use me as the source */ p_set->source.jobid = ORTE_PROC_MY_NAME->jobid; p_set->source.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch); } else { /* otherwise, set the HNP as the source */ p_set->source.jobid = ORTE_PROC_MY_HNP->jobid; p_set->source.vpid = ORTE_PROC_MY_HNP->vpid; - ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch); } p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid; p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch); opal_list_append(&(filem_request->process_sets), &(p_set->super) ); /* Define the file set */ diff --git a/orte/mca/state/Makefile.am b/orte/mca/state/Makefile.am new file mode 
100644 index 0000000000..6763c71c41 --- /dev/null +++ b/orte/mca/state/Makefile.am @@ -0,0 +1,31 @@ +# +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_state.la +libmca_state_la_SOURCES = + +# pkgdata setup +dist_pkgdata_DATA = + +# local files +headers = state.h state_types.h +libmca_state_la_SOURCES += $(headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +ortedir = $(includedir)/openmpi/$(subdir) +nobase_orte_HEADERS = $(headers) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/orte/mca/errmgr/app/.windows b/orte/mca/state/app/.windows similarity index 100% rename from orte/mca/errmgr/app/.windows rename to orte/mca/state/app/.windows diff --git a/orte/mca/state/app/Makefile.am b/orte/mca/state/app/Makefile.am new file mode 100644 index 0000000000..8fa9dfa564 --- /dev/null +++ b/orte/mca/state/app/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + state_app.h \ + state_app_component.c \ + state_app.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_state_app_DSO +component_noinst = +component_install = mca_state_app.la +else +component_noinst = libmca_state_app.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_state_app_la_SOURCES = $(sources) +mca_state_app_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_state_app_la_SOURCES =$(sources) +libmca_state_app_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/state/app/state_app.c b/orte/mca/state/app/state_app.c new file mode 100644 index 0000000000..8dd87bcc5b --- /dev/null +++ b/orte/mca/state/app/state_app.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/state_private.h" +#include "state_app.h" + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +/****************** + * APP module - just uses base functions after + * initializing the proc state machine. Job state + * machine is unused by application procs at this + * time. 
+ ******************/ +orte_state_base_module_t orte_state_app_module = { + init, + finalize, + orte_state_base_activate_job_state, + orte_state_base_add_job_state, + orte_state_base_set_job_state_callback, + orte_state_base_set_job_state_priority, + orte_state_base_remove_job_state, + orte_state_base_activate_proc_state, + orte_state_base_add_proc_state, + orte_state_base_set_proc_state_callback, + orte_state_base_set_proc_state_priority, + orte_state_base_remove_proc_state +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + /* we don't use the job state machine, so just + * setup the proc state machine + */ + OBJ_CONSTRUCT(&orte_proc_states, opal_list_t); + + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + opal_list_item_t *item; + + /* cleanup the proc state machine */ + while (NULL != (item = opal_list_remove_first(&orte_proc_states))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_proc_states); + + return ORTE_SUCCESS; +} diff --git a/orte/mca/state/app/state_app.h b/orte/mca/state/app/state_app.h new file mode 100644 index 0000000000..a358c66d5d --- /dev/null +++ b/orte/mca/state/app/state_app.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_STATE_APP_EXPORT_H +#define MCA_STATE_APP_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/state/state.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_app_component; + +ORTE_DECLSPEC extern orte_state_base_module_t orte_state_app_module; + +END_C_DECLS + +#endif /* MCA_STATE_APP_EXPORT_H */ diff --git a/orte/mca/state/app/state_app_component.c b/orte/mca/state/app/state_app_component.c new file mode 100644 index 0000000000..f041fc49cb --- /dev/null +++ b/orte/mca/state/app/state_app_component.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "state_app.h" + +/* + * Public string for version number + */ +const char *orte_state_app_component_version_string = + "ORTE STATE app MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int state_app_open(void); +static int state_app_close(void); +static int state_app_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_state_base_component_t mca_state_app_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component + */ + { + ORTE_STATE_BASE_VERSION_1_0_0, + /* Component name and version */ + "app", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + state_app_open, + state_app_close, + state_app_component_query + }, + { + /* The component is checkpoint 
ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, +}; + +static int my_priority=1000; + +static int state_app_open(void) +{ + return ORTE_SUCCESS; +} + +static int state_app_close(void) +{ + return ORTE_SUCCESS; +} + +static int state_app_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_APP) { + /* set our priority high as we are the default for apps */ + *priority = my_priority; + *module = (mca_base_module_t *)&orte_state_app_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/state/base/Makefile.am b/orte/mca/state/base/Makefile.am new file mode 100644 index 0000000000..731169f12f --- /dev/null +++ b/orte/mca/state/base/Makefile.am @@ -0,0 +1,19 @@ +# +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +headers += \ + base/state_private.h \ + base/base.h + +libmca_state_la_SOURCES += \ + base/state_base_close.c \ + base/state_base_select.c \ + base/state_base_open.c \ + base/state_base_fns.c diff --git a/orte/mca/state/base/base.h b/orte/mca/state/base/base.h new file mode 100644 index 0000000000..2ec40120a9 --- /dev/null +++ b/orte/mca/state/base/base.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + */ + +#ifndef ORTE_MCA_STATE_BASE_H +#define ORTE_MCA_STATE_BASE_H + +/* + * includes + */ +#include "orte_config.h" +#include "orte/constants.h" + +#include "opal/class/opal_list.h" + +#include "opal/mca/mca.h" +#include "orte/mca/state/state.h" + + +BEGIN_C_DECLS + +/* + * MCA Framework functions + */ +ORTE_DECLSPEC int orte_state_base_open(void); +ORTE_DECLSPEC int orte_state_base_select(void); +ORTE_DECLSPEC int orte_state_base_close(void); + +/** + * Output and component variables + */ +ORTE_DECLSPEC extern opal_list_t orte_state_base_components_available; + +/** + * Internal module reference + */ +ORTE_DECLSPEC extern orte_state_base_component_t orte_state_base_selected_component; + + +/* debug tools */ +ORTE_DECLSPEC void orte_state_base_print_job_state_machine(void); + +ORTE_DECLSPEC void orte_state_base_print_proc_state_machine(void); + +END_C_DECLS + +#endif diff --git a/orte/mca/state/base/state_base_close.c b/orte/mca/state/base/state_base_close.c new file mode 100644 index 0000000000..4f7d08590b --- /dev/null +++ b/orte/mca/state/base/state_base_close.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/base/state_private.h" + + +int orte_state_base_close(void) +{ + /* if not initialized, then skip this action. 
*/ + if( !orte_state_base.initialized ) { + return ORTE_SUCCESS; + } + + /* Close selected component */ + if( NULL != orte_state.finalize ) { + orte_state.finalize(); + } + + /* Close all remaining available components */ + mca_base_components_close(orte_state_base_output, + &orte_state_base_components_available, + NULL); + + orte_state_base.initialized = false; + + return ORTE_SUCCESS; +} diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c new file mode 100644 index 0000000000..f3d04970a0 --- /dev/null +++ b/orte/mca/state/base/state_base_fns.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file **/ + +#include "orte_config.h" +#include "orte/constants.h" + +#include "opal/class/opal_list.h" +#include "opal/mca/event/event.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/mca/plm/plm_types.h" + +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/base/state_private.h" + +void orte_state_base_activate_job_state(orte_job_t *jdata, + orte_job_state_t state) +{ + opal_list_item_t *itm, *any=NULL, *error=NULL; + orte_state_t *s; + orte_state_caddy_t *caddy; + + for (itm = opal_list_get_first(&orte_job_states); + itm != opal_list_get_end(&orte_job_states); + itm = opal_list_get_next(itm)) { + s = (orte_state_t*)itm; + if (s->job_state == ORTE_JOB_STATE_ANY) { + /* save this place */ + any = itm; + } + if (s->job_state == ORTE_JOB_STATE_ERROR) { + error = itm; + } + if (s->job_state == state) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "%s ACTIVATING JOB %s STATE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(state))); + if (NULL == s->cbfunc) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "%s NULL CBFUNC FOR JOB %s STATE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == jdata) ? 
"ALL" : ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(state))); + return; + } + caddy = OBJ_NEW(orte_state_caddy_t); + if (NULL != jdata) { + caddy->jdata = jdata; + caddy->job_state = state; + OBJ_RETAIN(jdata); + } + opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy); + opal_event_set_priority(&caddy->ev, s->priority); + opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1); + return; + } + } + /* if we get here, then the state wasn't found, so execute + * the default handler if it is defined + */ + if (ORTE_JOB_STATE_ERROR < state && NULL != error) { + s = (orte_state_t*)error; + } else if (NULL != any) { + s = (orte_state_t*)any; + } else { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "ACTIVATE: ANY STATE NOT FOUND")); + return; + } + if (NULL == s->cbfunc) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "ACTIVATE: ANY STATE HANDLER NOT DEFINED")); + return; + } + caddy = OBJ_NEW(orte_state_caddy_t); + if (NULL != jdata) { + caddy->jdata = jdata; + caddy->job_state = state; + OBJ_RETAIN(jdata); + } + opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy); + opal_event_set_priority(&caddy->ev, s->priority); + opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1); +} + + +int orte_state_base_add_job_state(orte_job_state_t state, + orte_state_cbfunc_t cbfunc, + int priority) +{ + opal_list_item_t *item; + orte_state_t *st; + + /* check for uniqueness */ + for (item = opal_list_get_first(&orte_job_states); + item != opal_list_get_end(&orte_job_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->job_state == state) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "DUPLICATE STATE DEFINED: %s", + orte_job_state_to_str(state))); + return ORTE_ERR_BAD_PARAM; + } + } + + st = OBJ_NEW(orte_state_t); + st->job_state = state; + st->cbfunc = cbfunc; + st->priority = priority; + opal_list_append(&orte_job_states, &(st->super)); + + return ORTE_SUCCESS; +} + +int 
orte_state_base_set_job_state_callback(orte_job_state_t state, + orte_state_cbfunc_t cbfunc) +{ + opal_list_item_t *item; + orte_state_t *st; + + for (item = opal_list_get_first(&orte_job_states); + item != opal_list_get_end(&orte_job_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->job_state == state) { + st->cbfunc = cbfunc; + return ORTE_SUCCESS; + } + } + return ORTE_ERR_NOT_FOUND; +} + +int orte_state_base_set_job_state_priority(orte_job_state_t state, + int priority) +{ + opal_list_item_t *item; + orte_state_t *st; + + for (item = opal_list_get_first(&orte_job_states); + item != opal_list_get_end(&orte_job_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->job_state == state) { + st->priority = priority; + return ORTE_SUCCESS; + } + } + return ORTE_ERR_NOT_FOUND; +} + +int orte_state_base_remove_job_state(orte_job_state_t state) +{ + opal_list_item_t *item; + orte_state_t *st; + + for (item = opal_list_get_first(&orte_job_states); + item != opal_list_get_end(&orte_job_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->job_state == state) { + opal_list_remove_item(&orte_job_states, item); + OBJ_RELEASE(item); + return ORTE_SUCCESS; + } + } + return ORTE_ERR_NOT_FOUND; +} + +void orte_state_base_print_job_state_machine(void) +{ + opal_list_item_t *item; + orte_state_t *st; + + opal_output(0, "ORTE_JOB_STATE_MACHINE:"); + for (item = opal_list_get_first(&orte_job_states); + item != opal_list_get_end(&orte_job_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + opal_output(0, "\tState: %s cbfunc: %s", + orte_job_state_to_str(st->job_state), + (NULL == st->cbfunc) ? 
"NULL" : "DEFINED"); + } +} + + +/**** PROC STATE MACHINE ****/ +void orte_state_base_activate_proc_state(orte_process_name_t *proc, + orte_proc_state_t state) +{ + opal_list_item_t *itm, *any=NULL, *error=NULL; + orte_state_t *s; + orte_state_caddy_t *caddy; + + for (itm = opal_list_get_first(&orte_proc_states); + itm != opal_list_get_end(&orte_proc_states); + itm = opal_list_get_next(itm)) { + s = (orte_state_t*)itm; + if (s->proc_state == ORTE_PROC_STATE_ANY) { + /* save this place */ + any = itm; + } + if (s->proc_state == ORTE_PROC_STATE_ERROR) { + error = itm; + } + if (s->proc_state == state) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "%s ACTIVATING PROC %s STATE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + if (NULL == s->cbfunc) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "%s NULL CBFUNC FOR PROC %s STATE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + return; + } + caddy = OBJ_NEW(orte_state_caddy_t); + caddy->name = *proc; + caddy->proc_state = state; + opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy); + opal_event_set_priority(&caddy->ev, s->priority); + opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1); + return; + } + } + /* if we get here, then the state wasn't found, so execute + * the default handler if it is defined + */ + if (ORTE_PROC_STATE_ERROR < state && NULL != error) { + s = (orte_state_t*)error; + } else if (NULL != any) { + s = (orte_state_t*)any; + } else { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "INCREMENT: ANY STATE NOT FOUND")); + return; + } + if (NULL == s->cbfunc) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "ACTIVATE: ANY STATE HANDLER NOT DEFINED")); + return; + } + caddy = OBJ_NEW(orte_state_caddy_t); + caddy->name = *proc; + caddy->proc_state = state; + opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy); + 
opal_event_set_priority(&caddy->ev, s->priority); + opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1); +} + +int orte_state_base_add_proc_state(orte_proc_state_t state, + orte_state_cbfunc_t cbfunc, + int priority) +{ + opal_list_item_t *item; + orte_state_t *st; + + /* check for uniqueness */ + for (item = opal_list_get_first(&orte_proc_states); + item != opal_list_get_end(&orte_proc_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->proc_state == state) { + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, + "DUPLICATE STATE DEFINED: %s", + orte_proc_state_to_str(state))); + return ORTE_ERR_BAD_PARAM; + } + } + + st = OBJ_NEW(orte_state_t); + st->proc_state = state; + st->cbfunc = cbfunc; + st->priority = priority; + opal_list_append(&orte_proc_states, &(st->super)); + + return ORTE_SUCCESS; +} + +int orte_state_base_set_proc_state_callback(orte_proc_state_t state, + orte_state_cbfunc_t cbfunc) +{ + opal_list_item_t *item; + orte_state_t *st; + + for (item = opal_list_get_first(&orte_proc_states); + item != opal_list_get_end(&orte_proc_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->proc_state == state) { + st->cbfunc = cbfunc; + return ORTE_SUCCESS; + } + } + return ORTE_ERR_NOT_FOUND; +} + +int orte_state_base_set_proc_state_priority(orte_proc_state_t state, + int priority) +{ + opal_list_item_t *item; + orte_state_t *st; + + for (item = opal_list_get_first(&orte_proc_states); + item != opal_list_get_end(&orte_proc_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if (st->proc_state == state) { + st->priority = priority; + return ORTE_SUCCESS; + } + } + return ORTE_ERR_NOT_FOUND; +} + +int orte_state_base_remove_proc_state(orte_proc_state_t state) +{ + opal_list_item_t *item; + orte_state_t *st; + + for (item = opal_list_get_first(&orte_proc_states); + item != opal_list_get_end(&orte_proc_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + if 
(st->proc_state == state) { + opal_list_remove_item(&orte_proc_states, item); + OBJ_RELEASE(item); + return ORTE_SUCCESS; + } + } + return ORTE_ERR_NOT_FOUND; +} + +void orte_state_base_print_proc_state_machine(void) +{ + opal_list_item_t *item; + orte_state_t *st; + + opal_output(0, "ORTE_PROC_STATE_MACHINE:"); + for (item = opal_list_get_first(&orte_proc_states); + item != opal_list_get_end(&orte_proc_states); + item = opal_list_get_next(item)) { + st = (orte_state_t*)item; + opal_output(0, "\tState: %s cbfunc: %s", + orte_proc_state_to_str(st->proc_state), + (NULL == st->cbfunc) ? "NULL" : "DEFINED"); + } +} + diff --git a/orte/mca/state/base/state_base_open.c b/orte/mca/state/base/state_base_open.c new file mode 100644 index 0000000000..199969a7d8 --- /dev/null +++ b/orte/mca/state/base/state_base_open.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" + +#include "orte/mca/plm/plm_types.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/base/state_private.h" + +#include "orte/mca/state/base/static-components.h" + +/* + * Globals + */ +opal_list_t orte_state_base_components_available; +orte_state_base_t orte_state_base; +orte_state_base_component_t orte_state_base_selected_component; +int orte_state_base_output; + +orte_state_base_module_t orte_state; + +/** + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. 
+ */ +int orte_state_base_open(void) +{ + /* Only pass this way once */ + if( orte_state_base.initialized ) { + return ORTE_SUCCESS; + } + + orte_state_base_output = opal_output_open(NULL); + + /* + * Open up all available components + */ + if (ORTE_SUCCESS != + mca_base_components_open("state", + orte_state_base_output, + mca_state_base_static_components, + &orte_state_base_components_available, + true)) { + return ORTE_ERROR; + } + + orte_state_base.initialized = true; + + return ORTE_SUCCESS; +} + +static void orte_state_construct(orte_state_t *state) +{ + state->job_state = ORTE_JOB_STATE_UNDEF; + state->proc_state = ORTE_PROC_STATE_UNDEF; + state->cbfunc = NULL; + state->priority = ORTE_INFO_PRI; +} +OBJ_CLASS_INSTANCE(orte_state_t, + opal_list_item_t, + orte_state_construct, + NULL); + +static void orte_state_caddy_construct(orte_state_caddy_t *caddy) +{ + memset(&caddy->ev, 0, sizeof(opal_event_t)); + caddy->jdata = NULL; +} +static void orte_state_caddy_destruct(orte_state_caddy_t *caddy) +{ + opal_event_del(&caddy->ev); + if (NULL != caddy->jdata) { + OBJ_RELEASE(caddy->jdata); + } +} +OBJ_CLASS_INSTANCE(orte_state_caddy_t, + opal_object_t, + orte_state_caddy_construct, + orte_state_caddy_destruct); + diff --git a/orte/mca/state/base/state_base_select.c b/orte/mca/state/base/state_base_select.c new file mode 100644 index 0000000000..48e20ba50d --- /dev/null +++ b/orte/mca/state/base/state_base_select.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/output.h" + +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/base/state_private.h" + +int orte_state_base_select(void) +{ + int exit_status = OPAL_SUCCESS; + orte_state_base_component_t *best_component = NULL; + orte_state_base_module_t *best_module = NULL; + + /* + * Select the best component + */ + if( OPAL_SUCCESS != mca_base_select("state", orte_state_base_output, + &orte_state_base_components_available, + (mca_base_module_t **) &best_module, + (mca_base_component_t **) &best_component) ) { + /* This will only happen if no component was selected */ + exit_status = ORTE_ERROR; + goto cleanup; + } + + /* Save the winner */ + orte_state_base_selected_component = *best_component; + orte_state = *best_module; + + /* Initialize the winner */ + if (NULL != best_module) { + if (OPAL_SUCCESS != orte_state.init()) { + exit_status = OPAL_ERROR; + goto cleanup; + } + } + + cleanup: + return exit_status; +} diff --git a/orte/mca/state/base/state_private.h b/orte/mca/state/base/state_private.h new file mode 100644 index 0000000000..d87e4b4cf7 --- /dev/null +++ b/orte/mca/state/base/state_private.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + */ + +#ifndef ORTE_MCA_STATE_PRIVATE_H +#define ORTE_MCA_STATE_PRIVATE_H + +/* + * includes + */ +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/types.h" + +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#include "orte/mca/plm/plm_types.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/state/state.h" + +/* + * Functions for use solely within the ERRMGR framework + */ +BEGIN_C_DECLS + +/* define a struct to hold framework-global values */ +typedef struct { + bool initialized; +} orte_state_base_t; + +ORTE_DECLSPEC extern orte_state_base_t orte_state_base; + +/* + * Base functions + */ +ORTE_DECLSPEC void orte_state_base_activate_job_state(orte_job_t *jdata, + orte_job_state_t state); + +ORTE_DECLSPEC int orte_state_base_add_job_state(orte_job_state_t state, + orte_state_cbfunc_t cbfunc, + int priority); + +ORTE_DECLSPEC int orte_state_base_set_job_state_callback(orte_job_state_t state, + orte_state_cbfunc_t cbfunc); + +ORTE_DECLSPEC int orte_state_base_set_job_state_priority(orte_job_state_t state, + int priority); + +ORTE_DECLSPEC int orte_state_base_remove_job_state(orte_job_state_t state); + +ORTE_DECLSPEC void orte_util_print_job_state_machine(void); + + +ORTE_DECLSPEC void orte_state_base_activate_proc_state(orte_process_name_t *proc, + orte_proc_state_t state); + +ORTE_DECLSPEC int orte_state_base_add_proc_state(orte_proc_state_t state, + orte_state_cbfunc_t cbfunc, + int priority); + +ORTE_DECLSPEC int orte_state_base_set_proc_state_callback(orte_proc_state_t state, + orte_state_cbfunc_t cbfunc); + +ORTE_DECLSPEC int orte_state_base_set_proc_state_priority(orte_proc_state_t state, + int priority); + +ORTE_DECLSPEC int orte_state_base_remove_proc_state(orte_proc_state_t state); + +ORTE_DECLSPEC void orte_util_print_proc_state_machine(void); + + +END_C_DECLS +#endif diff --git a/orte/mca/errmgr/orted/.windows 
b/orte/mca/state/hnp/.windows similarity index 100% rename from orte/mca/errmgr/orted/.windows rename to orte/mca/state/hnp/.windows diff --git a/orte/mca/state/hnp/Makefile.am b/orte/mca/state/hnp/Makefile.am new file mode 100644 index 0000000000..47dcf2f43a --- /dev/null +++ b/orte/mca/state/hnp/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + state_hnp.h \ + state_hnp_component.c \ + state_hnp.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_state_hnp_DSO +component_noinst = +component_install = mca_state_hnp.la +else +component_noinst = libmca_state_hnp.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_state_hnp_la_SOURCES = $(sources) +mca_state_hnp_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_state_hnp_la_SOURCES =$(sources) +libmca_state_hnp_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/app/configure.m4 b/orte/mca/state/hnp/configure.m4 similarity index 54% rename from orte/mca/errmgr/app/configure.m4 rename to orte/mca/state/hnp/configure.m4 index bf12b54215..9f40b478a1 100644 --- a/orte/mca/errmgr/app/configure.m4 +++ b/orte/mca/state/hnp/configure.m4 @@ -8,12 +8,12 @@ # # $HEADER$ # -# MCA_errmgr_app_CONFIG([action-if-found], [action-if-not-found]) +# MCA_state_hnp_CONFIG([action-if-found], [action-if-not-found]) # ----------------------------------------------------------- -AC_DEFUN([MCA_orte_errmgr_app_CONFIG], [ - AC_CONFIG_FILES([orte/mca/errmgr/app/Makefile]) +AC_DEFUN([MCA_orte_state_hnp_CONFIG], [ + AC_CONFIG_FILES([orte/mca/state/hnp/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 1 -a "$orte_without_full_support" = 0], + AS_IF([test 
"$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c new file mode 100644 index 0000000000..02a014e757 --- /dev/null +++ b/orte/mca/state/hnp/state_hnp.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/plm/base/base.h" +#include "orte/mca/ras/base/base.h" +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/notifier/notifier.h" +#include "orte/mca/sensor/sensor.h" +#include "orte/util/session_dir.h" +#include "orte/runtime/orte_quit.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "orte/mca/state/base/state_private.h" +#include "state_hnp.h" + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +/****************** + * HNP module - just uses base functions after + * initializing the proc state machine. Job state + * machine is unused by hnplication procs at this + * time. 
+ ******************/ +orte_state_base_module_t orte_state_hnp_module = { + init, + finalize, + orte_state_base_activate_job_state, + orte_state_base_add_job_state, + orte_state_base_set_job_state_callback, + orte_state_base_set_job_state_priority, + orte_state_base_remove_job_state, + orte_state_base_activate_proc_state, + orte_state_base_add_proc_state, + orte_state_base_set_proc_state_callback, + orte_state_base_set_proc_state_priority, + orte_state_base_remove_proc_state +}; + +static void ignore_cbfunc(int fd, short argc, void *cbdata) +{ + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + OBJ_RELEASE(state); +} + +static void track_procs(int fd, short argc, void *cbdata); +static void check_all_complete(int fd, short argc, void *cbdata); + +/* defined default state machine sequence - individual + * plm's must add a state for launching daemons + */ +static orte_job_state_t launch_states[] = { + ORTE_JOB_STATE_INIT, + ORTE_JOB_STATE_ALLOCATE, + ORTE_JOB_STATE_DAEMONS_LAUNCHED, + ORTE_JOB_STATE_DAEMONS_REPORTED, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_SYSTEM_PREP, + ORTE_JOB_STATE_LAUNCH_APPS, + ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, + ORTE_JOB_STATE_RUNNING, + ORTE_JOB_STATE_REGISTERED, + /* termination states */ + ORTE_JOB_STATE_TERMINATED, + ORTE_JOB_STATE_ALL_JOBS_COMPLETE, + ORTE_JOB_STATE_DAEMONS_TERMINATED +}; +static orte_state_cbfunc_t launch_callbacks[] = { + orte_plm_base_setup_job, + orte_ras_base_allocate, + orte_plm_base_daemons_launched, + orte_plm_base_daemons_reported, + orte_rmaps_base_map_job, + orte_plm_base_complete_setup, + orte_plm_base_launch_apps, + ignore_cbfunc, /* HNP doesn't need to process local_launch_complete */ + orte_plm_base_post_launch, + orte_plm_base_registered, + check_all_complete, + orte_quit, + orte_quit +}; + +static orte_proc_state_t proc_states[] = { + ORTE_PROC_STATE_RUNNING, + ORTE_PROC_STATE_REGISTERED, + ORTE_PROC_STATE_IOF_COMPLETE, + ORTE_PROC_STATE_WAITPID_FIRED, + ORTE_PROC_STATE_TERMINATED +}; +static 
orte_state_cbfunc_t proc_callbacks[] = { + track_procs, + track_procs, + track_procs, + track_procs, + track_procs +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + int i, rc; + int num_states; + + /* setup the state machines */ + OBJ_CONSTRUCT(&orte_job_states, opal_list_t); + OBJ_CONSTRUCT(&orte_proc_states, opal_list_t); + + /* setup the job state machine */ + num_states = sizeof(launch_states) / sizeof(orte_job_state_t); + for (i=0; i < num_states; i++) { + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(launch_states[i], + launch_callbacks[i], + ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + } + /* add a default error response */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT, + orte_quit, ORTE_ERROR_PRI))) { + ORTE_ERROR_LOG(rc); + } + if (5 < opal_output_get_verbosity(orte_state_base_output)) { + orte_state_base_print_job_state_machine(); + } + + /* populate the proc state machine to allow us to + * track proc lifecycle changes + */ + num_states = sizeof(proc_states) / sizeof(orte_proc_state_t); + for (i=0; i < num_states; i++) { + if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i], + proc_callbacks[i], + ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + } + if (5 < opal_output_get_verbosity(orte_state_base_output)) { + orte_state_base_print_proc_state_machine(); + } + + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + opal_list_item_t *item; + + /* cleanup the proc state machine */ + while (NULL != (item = opal_list_remove_first(&orte_proc_states))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_proc_states); + + return ORTE_SUCCESS; +} + +static void track_procs(int fd, short argc, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_process_name_t *proc = &caddy->name; + orte_proc_state_t state = caddy->proc_state; + orte_job_t *jdata; + orte_proc_t *pdata; + + OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, + 
"%s state:hnp:track_procs called for proc %s state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + + /* get the job object for this proc */ + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + goto cleanup; + } + pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); + + if (ORTE_PROC_STATE_RUNNING == state) { + /* update the proc state */ + pdata->state = state; + jdata->num_launched++; + if (jdata->num_launched == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); + } + } else if (ORTE_PROC_STATE_REGISTERED == state) { + /* update the proc state */ + pdata->state = state; + jdata->num_reported++; + if (jdata->num_reported == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); + } + } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { + /* update the proc state */ + pdata->state = state; + /* Release only the stdin IOF file descriptor for this child, if one + * was defined. File descriptors for the other IOF channels - stdout, + * stderr, and stddiag - were released when their associated pipes + * were cleared and closed due to termination of the process + */ + if (NULL != orte_iof.close) { + orte_iof.close(proc, ORTE_IOF_STDIN); + } + pdata->iof_complete = true; + if (pdata->waitpid_recvd) { + /* the proc has terminated */ + pdata->alive = false; + pdata->state = ORTE_PROC_STATE_TERMINATED; + /* Clean up the session directory as if we were the process + * itself. This covers the case where the process died abnormally + * and didn't cleanup its own session directory. 
+ */ + orte_session_dir_finalize(proc); + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } + } + } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { + /* update the proc state */ + pdata->state = state; + pdata->waitpid_recvd = true; + if (pdata->iof_complete) { + /* the proc has terminated */ + pdata->alive = false; + pdata->state = ORTE_PROC_STATE_TERMINATED; + /* Clean up the session directory as if we were the process + * itself. This covers the case where the process died abnormally + * and didn't cleanup its own session directory. + */ + orte_session_dir_finalize(proc); + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } + } + } else if (ORTE_PROC_STATE_TERMINATED == state) { + /* update the proc state */ + pdata->state = state; + if (pdata->local_proc) { + /* Clean up the session directory as if we were the process + * itself. This covers the case where the process died abnormally + * and didn't cleanup its own session directory. + */ + orte_session_dir_finalize(proc); + } + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_procs) { + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); + } + } + + cleanup: + OBJ_RELEASE(caddy); +} + +static void check_all_complete(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; + + orte_proc_t *proc; + int i; + orte_std_cntr_t j; + orte_job_t *job; + orte_node_t *node; + orte_job_map_t *map; + orte_std_cntr_t index; + bool one_still_alive; + orte_vpid_t lowest=0; + char *msg; + + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_complete on job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == jdata) ? 
"NULL" : ORTE_JOBID_PRINT(jdata->jobid))); + + if (NULL == jdata) { + /* just check to see if the daemons are complete */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_complete - received NULL job, checking daemons", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CHECK_DAEMONS; + } + + /* turn off any sensor monitors on this job */ + orte_sensor.stop(jdata->jobid); + + if (0 < jdata->num_non_zero_exit && !orte_abort_non_zero_exit) { + if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { + /* update the exit code */ + ORTE_UPDATE_EXIT_STATUS(lowest); + } + + /* warn user */ + opal_output(orte_clean_output, + "-------------------------------------------------------\n" + "While %s job %s terminated normally, %d %s. Further examination may be required.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + jdata->num_non_zero_exit, + (1 == jdata->num_non_zero_exit) ? "process returned\na non-zero exit code." : + "processes returned\nnon-zero exit codes."); + } + + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_completed declared job %s normally terminated - checking all jobs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + + /* if this job is a continuously operating one, then don't do + * anything further - just return here + */ + if (NULL != jdata && + (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || + ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { + goto CHECK_ALIVE; + } + + /* if the job that is being checked is the HNP, then we are + * trying to terminate the orteds. In that situation, we + * do -not- check all jobs - we simply notify the HNP + * that the orteds are complete. 
Also check special case + * if jdata is NULL - we want + * to definitely declare the job done if the orteds + * have completed, no matter what else may be happening. + * This can happen if a ctrl-c hits in the "wrong" place + * while launching + */ + CHECK_DAEMONS: + if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + if (0 == orte_routed.num_routes()) { + /* orteds are done! */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (NULL == jdata) { + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + } + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); + OBJ_RELEASE(caddy); + return; + } + OBJ_RELEASE(caddy); + return; + } + + /* Release the resources used by this job. Since some errmgrs may want + * to continue using resources allocated to the job as part of their + * fault recovery procedure, we only do this once the job is "complete". + * Note that an aborted/killed job -is- flagged as complete and will + * therefore have its resources released. 
We need to do this after + * we call the errmgr so that any attempt to restart the job will + * avoid doing so in the exact same place as the current job + */ + if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { + map = jdata->map; + for (index = 0; index < map->nodes->size; index++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { + continue; + } + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s releasing procs from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); + for (i = 0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + /* skip procs from another job */ + continue; + } + node->slots_inuse--; + node->num_procs--; + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s releasing proc %s from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), node->name)); + /* set the entry in the node array to NULL */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* release the proc once for the map entry */ + OBJ_RELEASE(proc); + } + } + OBJ_RELEASE(map); + jdata->map = NULL; + } + + CHECK_ALIVE: + /* now check to see if all jobs are done - release this jdata + * object when we find it + */ + one_still_alive = false; + for (j=1; j < orte_job_data->size; j++) { + if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { + /* since we are releasing jdata objects as we + * go, we can no longer assume that the job_data + * array is left justified + */ + continue; + } + /* if this is the job we are checking AND it normally terminated, + * then go ahead and release it. 
We cannot release it if it + * abnormally terminated as mpirun needs the info so it can + * report appropriately to the user + * + * NOTE: do not release the primary job (j=1) so we + * can pretty-print completion message + */ + if (NULL != jdata && job->jobid == jdata->jobid && + (jdata->state == ORTE_JOB_STATE_TERMINATED || + jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! + */ + if (1 < j) { + opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ + OBJ_RELEASE(jdata); + } + continue; + } + /* if the job is flagged to not be monitored, skip it */ + if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { + continue; + } + /* when checking for job termination, we must be sure to NOT check + * our own job as it - rather obviously - has NOT terminated! + */ + if (job->num_terminated < job->num_procs) { + /* we have at least one job that is not done yet - we cannot + * just return, though, as we need to ensure we cleanout the + * job data for the job that just completed + */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_completed job %s is not terminated (%d:%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs)); + one_still_alive = true; + } + else { + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_completed job %s is terminated (%d vs %d [%s])", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs, + (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); + } + } + /* if a job is still alive, we just return */ + if (one_still_alive) { + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_completed at least one job is not terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + OBJ_RELEASE(caddy); + return; + } + /* if we get here, then all jobs are done, so terminate */ + OPAL_OUTPUT_VERBOSE((2, orte_state_base_output, + "%s state:hnp:check_job_completed all jobs terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* set the exit status to 0 - this will only happen if it + * wasn't already set by an error condition + */ + ORTE_UPDATE_EXIT_STATUS(0); + /* provide a notifier message if that framework is active - ignored otherwise */ + if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { + if (0 == orte_exit_status) { + asprintf(&msg, "Job %s complete", ORTE_JOBID_PRINT(job->jobid)); + orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); + } else { + asprintf(&msg, "Job %s terminated abnormally", ORTE_JOBID_PRINT(job->jobid)); + orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); + } + free(msg); + /* this job object will be release during finalize */ + } + + /* order daemon termination - this tells us to cleanup + * our local procs as well as telling remote daemons + * to die + */ + orte_plm.terminate_orteds(); + + OBJ_RELEASE(caddy); +} diff --git a/orte/mca/state/hnp/state_hnp.h b/orte/mca/state/hnp/state_hnp.h new file mode 100644 index 0000000000..dfbd0ed805 --- /dev/null +++ b/orte/mca/state/hnp/state_hnp.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_STATE_HNP_EXPORT_H +#define MCA_STATE_HNP_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/state/state.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_hnp_component; + +ORTE_DECLSPEC extern orte_state_base_module_t orte_state_hnp_module; + +END_C_DECLS + +#endif /* MCA_STATE_HNP_EXPORT_H */ diff --git a/orte/mca/state/hnp/state_hnp_component.c b/orte/mca/state/hnp/state_hnp_component.c new file mode 100644 index 0000000000..0c6c147aaa --- /dev/null +++ b/orte/mca/state/hnp/state_hnp_component.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "state_hnp.h" + +/* + * Public string for version number + */ +const char *orte_state_hnp_component_version_string = + "ORTE STATE hnp MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int state_hnp_open(void); +static int state_hnp_close(void); +static int state_hnp_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_state_base_component_t mca_state_hnp_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component + */ + { + ORTE_STATE_BASE_VERSION_1_0_0, + /* Component name and version */ + "hnp", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + state_hnp_open, + state_hnp_close, + state_hnp_component_query + }, + { + /* The component is checkpoint 
ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, +}; + +static int my_priority=1000; + +static int state_hnp_open(void) +{ + return ORTE_SUCCESS; +} + +static int state_hnp_close(void) +{ + return ORTE_SUCCESS; +} + +static int state_hnp_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_HNP) { + /* set our priority high as we are the default for hnps */ + *priority = my_priority; + *module = (mca_base_module_t *)&orte_state_hnp_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/state/orted/.windows b/orte/mca/state/orted/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/state/orted/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/state/orted/Makefile.am b/orte/mca/state/orted/Makefile.am new file mode 100644 index 0000000000..f9cedb2dd1 --- /dev/null +++ b/orte/mca/state/orted/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + state_orted.h \ + state_orted_component.c \ + state_orted.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_state_orted_DSO +component_noinst = +component_install = mca_state_orted.la +else +component_noinst = libmca_state_orted.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_state_orted_la_SOURCES = $(sources) +mca_state_orted_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_state_orted_la_SOURCES =$(sources) +libmca_state_orted_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/hnp/configure.m4 b/orte/mca/state/orted/configure.m4 similarity index 54% rename from orte/mca/errmgr/hnp/configure.m4 rename to orte/mca/state/orted/configure.m4 index 440d03515b..0a5965372b 100644 --- a/orte/mca/errmgr/hnp/configure.m4 +++ b/orte/mca/state/orted/configure.m4 @@ -8,12 +8,12 @@ # # $HEADER$ # -# MCA_errmgr_hnp_CONFIG([action-if-found], [action-if-not-found]) +# MCA_state_orted_CONFIG([action-if-found], [action-if-not-found]) # ----------------------------------------------------------- -AC_DEFUN([MCA_orte_errmgr_hnp_CONFIG], [ - AC_CONFIG_FILES([orte/mca/errmgr/hnp/Makefile]) +AC_DEFUN([MCA_orte_state_orted_CONFIG], [ + AC_CONFIG_FILES([orte/mca/state/orted/Makefile]) - AS_IF([test "$orte_enable_resilient_code" = 1 -a "$orte_without_full_support" = 0], + AS_IF([test "$orte_without_full_support" = 0], [$1], [$2]) ]) diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c new file mode 100644 index 0000000000..5fc4628cf3 --- /dev/null +++ b/orte/mca/state/orted/state_orted.c @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif  /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/iof/iof.h"
+#include "orte/mca/rml/rml.h"
+#include "orte/util/session_dir.h"
+#include "orte/runtime/orte_quit.h"
+
+#include "orte/mca/state/state.h"
+#include "orte/mca/state/base/base.h"
+#include "orte/mca/state/base/state_private.h"
+#include "state_orted.h"
+
+/*
+ * Module functions: Global
+ */
+static int init(void);
+static int finalize(void);
+
+/******************
+ * ORTED module - just uses base functions after
+ * initializing the proc state machine. Job state
+ * machine is unused by orted daemons at this
+ * time.
+ ******************/
+orte_state_base_module_t orte_state_orted_module = {
+    init,
+    finalize,
+    orte_state_base_activate_job_state,
+    orte_state_base_add_job_state,
+    orte_state_base_set_job_state_callback,
+    orte_state_base_set_job_state_priority,
+    orte_state_base_remove_job_state,
+    orte_state_base_activate_proc_state,
+    orte_state_base_add_proc_state,
+    orte_state_base_set_proc_state_callback,
+    orte_state_base_set_proc_state_priority,
+    orte_state_base_remove_proc_state
+};
+
+/* Local functions */
+static void track_jobs(int fd, short argc, void *cbdata);
+static void track_procs(int fd, short argc, void *cbdata);
+static int pack_state_update(opal_buffer_t *buf, orte_job_t *jdata);
+static int pack_child_contact_info(orte_jobid_t jobid, opal_buffer_t *buf);
+
+/* defined default state machines */
+static orte_job_state_t job_states[] = {
+    ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
+};
+static orte_state_cbfunc_t job_callbacks[] = {
+    track_jobs
+};
+
+static orte_proc_state_t proc_states[] = {
+    ORTE_PROC_STATE_RUNNING,
+    ORTE_PROC_STATE_REGISTERED,
+    ORTE_PROC_STATE_IOF_COMPLETE,
+    ORTE_PROC_STATE_WAITPID_FIRED
+};
+static orte_state_cbfunc_t proc_callbacks[] = { + track_procs, + track_procs, + track_procs, + track_procs +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + int num_states, i, rc; + + /* setup the state machine */ + OBJ_CONSTRUCT(&orte_job_states, opal_list_t); + OBJ_CONSTRUCT(&orte_proc_states, opal_list_t); + + num_states = sizeof(job_states) / sizeof(orte_job_state_t); + for (i=0; i < num_states; i++) { + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(job_states[i], + job_callbacks[i], + ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + } + /* add a default error response */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT, + orte_quit, ORTE_ERROR_PRI))) { + ORTE_ERROR_LOG(rc); + } + /* add a state for when we are ordered to terminate */ + if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_DAEMONS_TERMINATED, + orte_quit, ORTE_ERROR_PRI))) { + ORTE_ERROR_LOG(rc); + } + if (5 < opal_output_get_verbosity(orte_state_base_output)) { + orte_state_base_print_job_state_machine(); + } + + /* populate the proc state machine to allow us to + * track proc lifecycle changes + */ + num_states = sizeof(proc_states) / sizeof(orte_proc_state_t); + for (i=0; i < num_states; i++) { + if (ORTE_SUCCESS != (rc = orte_state.add_proc_state(proc_states[i], + proc_callbacks[i], + ORTE_SYS_PRI))) { + ORTE_ERROR_LOG(rc); + } + } + if (5 < opal_output_get_verbosity(orte_state_base_output)) { + orte_state_base_print_proc_state_machine(); + } + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + opal_list_item_t *item; + + /* cleanup the state machines */ + while (NULL != (item = opal_list_remove_first(&orte_job_states))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_job_states); + while (NULL != (item = opal_list_remove_first(&orte_proc_states))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_proc_states); + + return ORTE_SUCCESS; +} + +static void track_jobs(int fd, short 
argc, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + opal_buffer_t *alert; + orte_plm_cmd_flag_t cmd; + int rc; + + if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) { + /* update the HNP with all proc states for this job */ + alert = OBJ_NEW(opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + goto cleanup; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(alert, caddy->jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + goto cleanup; + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(alert); + } + } + + cleanup: + OBJ_RELEASE(caddy); +} + +static void track_procs(int fd, short argc, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_process_name_t *proc = &caddy->name; + orte_proc_state_t state = caddy->proc_state; + orte_job_t *jdata; + orte_proc_t *pdata, *pptr; + opal_buffer_t *alert; + int rc, i; + orte_plm_cmd_flag_t cmd; + orte_vpid_t null=ORTE_VPID_INVALID; + + OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, + "%s state:orted:track_procs called for proc %s state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state))); + + /* get the job object for this proc */ + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + goto cleanup; + } + pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); + + if (ORTE_PROC_STATE_RUNNING == state) { + /* update the proc state */ + pdata->state = state; + jdata->num_launched++; + /* don't update until we are told that all are done */ + } else if (ORTE_PROC_STATE_REGISTERED == state) { + /* update the proc state */ + 
pdata->state = state; + jdata->num_reported++; + if (jdata->num_reported == jdata->num_local_procs) { + /* once everyone registers, send their contact info to + * the HNP so it is available to debuggers and anyone + * else that needs it + */ + + OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, + "%s state:orted: sending contact info to HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + alert = OBJ_NEW(opal_buffer_t); + /* pack init routes command */ + cmd = ORTE_PLM_INIT_ROUTES_CMD; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* pack all the local child vpids and epochs */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (pptr->name.jobid == proc->jobid) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } + } + /* pack an invalid marker */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* add in contact info for all procs in the job */ + if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&alert); + goto cleanup; + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + } + } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { + /* do NOT update the proc state as this can hit + * while we are still trying to notify the HNP of + * successful launch for short-lived procs + */ + /* Release only the stdin IOF file descriptor for this child, if one + * was defined. 
File descriptors for the other IOF channels - stdout, + * stderr, and stddiag - were released when their associated pipes + * were cleared and closed due to termination of the process + */ + if (NULL != orte_iof.close) { + orte_iof.close(proc, ORTE_IOF_STDIN); + } + pdata->iof_complete = true; + if (pdata->waitpid_recvd) { + /* the proc has terminated */ + pdata->alive = false; + pdata->state = ORTE_PROC_STATE_TERMINATED; + /* Clean up the session directory as if we were the process + * itself. This covers the case where the process died abnormally + * and didn't cleanup its own session directory. + */ + orte_session_dir_finalize(proc); + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_local_procs) { + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + alert = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { + ORTE_ERROR_LOG(rc); + } + /* send it */ + OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, + "%s SENDING PROC TERMINATION UPDATE FOR JOB %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + } + } + } + } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { + /* do NOT update the proc state as this can hit + * while we are still trying to notify the HNP of + * successful launch for short-lived procs + */ + pdata->waitpid_recvd = true; + if (pdata->iof_complete) { + /* the proc has terminated */ + pdata->alive = false; + pdata->state = ORTE_PROC_STATE_TERMINATED; + /* Clean up the session directory as if we were the process + * itself. This covers the case where the process died abnormally + * and didn't cleanup its own session directory. 
+ */ + orte_session_dir_finalize(proc); + /* track job status */ + jdata->num_terminated++; + if (jdata->num_terminated == jdata->num_local_procs) { + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + alert = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { + ORTE_ERROR_LOG(rc); + } + /* send it */ + OPAL_OUTPUT_VERBOSE((5, orte_state_base_output, + "%s SENDING PROC TERMINATION UPDATE FOR JOB %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, + ORTE_RML_TAG_PLM, 0, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + } + } + } + } + + cleanup: + OBJ_RELEASE(caddy); +} + +static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child) +{ + int rc; + + /* pack the child's vpid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the pid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack its state */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack its exit code */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} + +static int pack_state_update(opal_buffer_t *alert, orte_job_t *jdata) +{ + int i, rc; + orte_proc_t *child; + orte_vpid_t null=ORTE_VPID_INVALID; + + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jdata->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (child = 
(orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + /* if this child is part of the job... */ + if (child->name.jobid == jdata->jobid) { + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* flag that this job is complete so the receiver can know */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} + +static int pack_child_contact_info(orte_jobid_t jobid, opal_buffer_t *buf) +{ + int i, rc; + orte_proc_t *pptr; + + for (i=0; i < orte_local_children->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { + continue; + } + if (jobid == pptr->name.jobid) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &pptr->rml_uri, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/state/orted/state_orted.h b/orte/mca/state/orted/state_orted.h new file mode 100644 index 0000000000..25e722625c --- /dev/null +++ b/orte/mca/state/orted/state_orted.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_STATE_ORTED_EXPORT_H +#define MCA_STATE_ORTED_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/state/state.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_orted_component; + +ORTE_DECLSPEC extern orte_state_base_module_t orte_state_orted_module; + +END_C_DECLS + +#endif /* MCA_STATE_ORTED_EXPORT_H */ diff --git a/orte/mca/state/orted/state_orted_component.c b/orte/mca/state/orted/state_orted_component.c new file mode 100644 index 0000000000..c74c859bc6 --- /dev/null +++ b/orte/mca/state/orted/state_orted_component.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" +#include "state_orted.h" + +/* + * Public string for version number + */ +const char *orte_state_orted_component_version_string = + "ORTE STATE orted MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int state_orted_open(void); +static int state_orted_close(void); +static int state_orted_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_state_base_component_t mca_state_orted_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component + */ + { + ORTE_STATE_BASE_VERSION_1_0_0, + /* Component name and version */ + "orted", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + state_orted_open, + state_orted_close, + state_orted_component_query 
+ }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, +}; + +static int my_priority=1000; + +static int state_orted_open(void) +{ + return ORTE_SUCCESS; +} + +static int state_orted_close(void) +{ + return ORTE_SUCCESS; +} + +static int state_orted_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_DAEMON) { + /* set our priority high as we are the default for orteds */ + *priority = my_priority; + *module = (mca_base_module_t *)&orte_state_orted_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/state/state.h b/orte/mca/state/state.h new file mode 100644 index 0000000000..cbc2d357f5 --- /dev/null +++ b/orte/mca/state/state.h @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/**** ORTE STATE MACHINE ****/ + +/* States are treated as events so that the event + * library can sequence them. Each state consists + * of an event, a job or process state, a pointer + * to the respective object, and a callback function + * to be executed for that state. Events can be defined + * at different priorities - e.g., SYS priority for + * events associated with launching jobs, and ERR priority + * for events associated with abnormal termination of + * a process. + * + * The state machine consists of a list of state objects, + * each defining a state-cbfunc pair. At startup, a default + * list is created by the base functions which is then + * potentially customized by selected components within + * the various ORTE frameworks. For example, a PLM component + * may need to insert states in the launch procedure, or may + * want to redirect a particular state callback to a custom + * function. 
+ * + * For convenience, an ANY state can be defined along with a generic + * callback function, with the corresponding state object + * placed at the end of the state machine. Setting the + * machine to a state that has not been explicitly defined + * will cause this default action to be executed. Thus, you + * don't have to explicitly define a state-cbfunc pair + * for every job or process state. + */ + +#ifndef _ORTE_STATE_H_ +#define _ORTE_STATE_H_ + +#include "orte_config.h" + +#include "opal/class/opal_list.h" +#include "opal/mca/event/event.h" + +#include "orte/mca/plm/plm_types.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/state/state_types.h" + +BEGIN_C_DECLS + +/* while unusual, we need to make the state framework's verbosity + * available here so that we can use it in the state machine + * macros + */ +ORTE_DECLSPEC extern int orte_state_base_output; + +/* For ease in debugging the state machine, it is STRONGLY recommended + * that the functions be accessed using the following macros + */ +#define ORTE_TERMINATE(x) \ + do { \ + ORTE_UPDATE_EXIT_STATUS(x); \ + orte_state.activate_job_state(NULL, \ + ORTE_JOB_STATE_FORCED_EXIT); \ + } while(0); + +#define ORTE_ACTIVATE_JOB_STATE(j, s) \ + do { \ + orte_job_t *shadow=(j); \ + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, \ + "%s ACTIVATE JOB %s STATE %s AT %s:%d", \ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ + (NULL == shadow) ? "NULL" : \ + ORTE_JOBID_PRINT(shadow->jobid), \ + orte_job_state_to_str((s)), \ + __FILE__, __LINE__)); \ + orte_state.activate_job_state((j), (s)); \ + } while(0); + +#define ORTE_ACTIVATE_PROC_STATE(p, s) \ + do { \ + orte_process_name_t *shadow=(p); \ + OPAL_OUTPUT_VERBOSE((1, orte_state_base_output, \ + "%s ACTIVATE PROC %s STATE %s AT %s:%d", \ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ + (NULL == shadow) ? 
"NULL" : \ + ORTE_NAME_PRINT(shadow), \ + orte_proc_state_to_str((s)), \ + __FILE__, __LINE__)); \ + orte_state.activate_proc_state((p), (s)); \ + } while(0); + +/** + * Module initialization function. + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_state_base_module_init_fn_t)(void); + +/** + * Module finalization function. + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_state_base_module_finalize_fn_t)(void); + +/**** JOB STATE APIs ****/ +/* Job states are accessed via orte_job_t objects as they are only + * used in ORTE tools and not application processes. APIs are provided + * for assembling and editing the state machine, as well as activating + * a specific job state + * + * Note the inherent assumption in this design that any customization + * of the state machine will at least start with the base states - i.e., + * that one would start with the default machine and edit it to add, + * remove, or modify callbacks as required. Alternatively, one could + * just clear the list entirely and assemble a fully custom state + * machine - both models are supported. + */ + +/* Activate a state in the job state machine. + * + * Creates and activates an event with the callback corresponding to the + * specified job state. If the specified state is not found: + * + * 1. if a state machine entry for ORTE_JOB_STATE_ERROR was given, and + * the state is an error state (i.e., ORTE_JOB_STATE_ERROR <= state), + * then the callback for the ERROR state will be used + * + * 2. if a state machine entry for ORTE_JOB_STATE_ANY was given, and + * the state is not an error state (i.e., state < ORTE_JOB_STATE_ERROR), + * then the callback for the ANY state will be used + * + * 3. if neither of the above is true, then the call will be ignored. 
+ */ +typedef void (*orte_state_base_module_activate_job_state_fn_t)(orte_job_t *jdata, + orte_job_state_t state); + +/* Add a state to the job state machine. + * + */ +typedef int (*orte_state_base_module_add_job_state_fn_t)(orte_job_state_t state, + orte_state_cbfunc_t cbfunc, + int priority); + +/* Set the callback function for a state in the job state machine. + * + */ +typedef int (*orte_state_base_module_set_job_state_callback_fn_t)(orte_job_state_t state, + orte_state_cbfunc_t cbfunc); + +/* Set the event priority for a state in the job state machine. + * + */ +typedef int (*orte_state_base_module_set_job_state_priority_fn_t)(orte_job_state_t state, + int priority); + +/* Remove a state from the job state machine. + * + */ +typedef int (*orte_state_base_module_remove_job_state_fn_t)(orte_job_state_t state); + + +/**** Proc STATE APIs ****/ +/* Proc states are accessed via orte_process_name_t as the state machine + * must be available to both application processes and ORTE tools. APIs are + * providedfor assembling and editing the state machine, as well as activating + * a specific proc state + * + * Note the inherent assumption in this design that any customization + * of the state machine will at least start with the base states - i.e., + * that one would start with the default machine and edit it to add, + * remove, or modify callbacks as required. Alternatively, one could + * just clear the list entirely and assemble a fully custom state + * machine - both models are supported. + */ + +/* Activate a proc state. + * + * Creates and activates an event with the callback corresponding to the + * specified proc state. If the specified state is not found: + * + * 1. if a state machine entry for ORTE_PROC_STATE_ERROR was given, and + * the state is an error state (i.e., ORTE_PROC_STATE_ERROR <= state), + * then the callback for the ERROR state will be used + * + * 2. 
if a state machine entry for ORTE_PROC_STATE_ANY was given, and + * the state is not an error state (i.e., state < ORTE_PROC_STATE_ERROR), + * then the callback for the ANY state will be used + * + * 3. if neither of the above is true, then the call will be ignored. + */ +typedef void (*orte_state_base_module_activate_proc_state_fn_t)(orte_process_name_t *proc, + orte_proc_state_t state); + +/* Add a state to the proc state machine. + * + */ +typedef int (*orte_state_base_module_add_proc_state_fn_t)(orte_proc_state_t state, + orte_state_cbfunc_t cbfunc, + int priority); + +/* Set the callback function for a state in the proc state machine. + * + */ +typedef int (*orte_state_base_module_set_proc_state_callback_fn_t)(orte_proc_state_t state, + orte_state_cbfunc_t cbfunc); + +/* Set the event priority for a state in the proc state machine. + * + */ +typedef int (*orte_state_base_module_set_proc_state_priority_fn_t)(orte_proc_state_t state, + int priority); + +/* Remove a state from the proc state machine. 
+ * + */ +typedef int (*orte_state_base_module_remove_proc_state_fn_t)(orte_proc_state_t state); + + +/* + * Module Structure + */ +struct orte_state_base_module_1_0_0_t { + /** Initialization Function */ + orte_state_base_module_init_fn_t init; + /** Finalization Function */ + orte_state_base_module_finalize_fn_t finalize; + /* Job state APIs */ + orte_state_base_module_activate_job_state_fn_t activate_job_state; + orte_state_base_module_add_job_state_fn_t add_job_state; + orte_state_base_module_set_job_state_callback_fn_t set_job_state_callback; + orte_state_base_module_set_job_state_priority_fn_t set_job_state_priority; + orte_state_base_module_remove_job_state_fn_t remove_job_state; + /* Proc state APIs */ + orte_state_base_module_activate_proc_state_fn_t activate_proc_state; + orte_state_base_module_add_proc_state_fn_t add_proc_state; + orte_state_base_module_set_proc_state_callback_fn_t set_proc_state_callback; + orte_state_base_module_set_proc_state_priority_fn_t set_proc_state_priority; + orte_state_base_module_remove_proc_state_fn_t remove_proc_state; +}; +typedef struct orte_state_base_module_1_0_0_t orte_state_base_module_1_0_0_t; +typedef orte_state_base_module_1_0_0_t orte_state_base_module_t; +ORTE_DECLSPEC extern orte_state_base_module_t orte_state; + +/* + * State Component + */ +struct orte_state_base_component_1_0_0_t { + /** MCA base component */ + mca_base_component_t base_version; + /** MCA base data */ + mca_base_component_data_t base_data; +}; +typedef struct orte_state_base_component_1_0_0_t orte_state_base_component_1_0_0_t; +typedef orte_state_base_component_1_0_0_t orte_state_base_component_t; + +/* + * Macro for use in components that are of type state + */ +#define ORTE_STATE_BASE_VERSION_1_0_0 \ + MCA_BASE_VERSION_2_0_0, \ + "state", 1, 0, 0 + +END_C_DECLS +#endif diff --git a/orte/mca/state/state_types.h b/orte/mca/state/state_types.h new file mode 100644 index 0000000000..c4c8442685 --- /dev/null +++ b/orte/mca/state/state_types.h @@ 
-0,0 +1,77 @@ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/**** ORTE STATE MACHINE ****/ + +/* States are treated as events so that the event + * library can sequence them. Each state consists + * of an event, a job or process state, a pointer + * to the respective object, and a callback function + * to be executed for that state. Events can be defined + * at different priorities - e.g., SYS priority for + * events associated with launching jobs, and ERR priority + * for events associated with abnormal termination of + * a process. + * + * The state machine consists of a list of state objects, + * each defining a state-cbfunc pair. At startup, a default + * list is created by the base functions which is then + * potentially customized by selected components within + * the various ORTE frameworks. For example, a PLM component + * may need to insert states in the launch procedure, or may + * want to redirect a particular state callback to a custom + * function. + * + * For convenience, an ANY state can be defined along with a generic + * callback function, with the corresponding state object + * placed at the end of the state machine. Setting the + * machine to a state that has not been explicitly defined + * will cause this default action to be executed. Thus, you + * don't have to explicitly define a state-cbfunc pair + * for every job or process state. 
+ */ + +#ifndef _ORTE_STATE_TYPES_H_ +#define _ORTE_STATE_TYPES_H_ + +#include "orte_config.h" + +#include "opal/class/opal_list.h" +#include "opal/mca/event/event.h" + +#include "orte/mca/plm/plm_types.h" +#include "orte/runtime/orte_globals.h" + +BEGIN_C_DECLS + +typedef void (*orte_state_cbfunc_t)(int fd, short args, void* cb); + +typedef struct { + opal_list_item_t super; + orte_job_state_t job_state; + orte_proc_state_t proc_state; + orte_state_cbfunc_t cbfunc; + int priority; +} orte_state_t; +ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_state_t); + +/* caddy for passing job and proc data to state event handlers */ +typedef struct { + opal_object_t super; + opal_event_t ev; + orte_job_t *jdata; + orte_job_state_t job_state; + orte_process_name_t name; + orte_proc_state_t proc_state; +} orte_state_caddy_t; +ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_state_caddy_t); + +END_C_DECLS +#endif diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index e559f064a4..c0b639f8d0 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -68,6 +68,7 @@ #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/routed/routed.h" #include "orte/mca/ess/ess.h" +#include "orte/mca/state/state.h" #include "orte/mca/odls/base/odls_private.h" @@ -88,273 +89,9 @@ struct timeval orte_daemon_msg_recvd; static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL; -static struct timeval mesg_recvd={0,0}; - -static void send_relay(opal_buffer_t *buf) -{ - opal_list_t recips; - opal_list_item_t *item; - orte_routed_tree_t *nm; - orte_process_name_t target; - int ret; - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:send_relay", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* get the list of next recipients from the routed module */ - OBJ_CONSTRUCT(&recips, opal_list_t); - /* ignore returned parent vpid - we don't care here */ - orte_routed.get_routing_tree(&recips); - - /* if list is empty, nothing for us to do */ - if 
(opal_list_is_empty(&recips)) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:send_relay - recipient list is empty!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - goto CLEANUP; - } - - /* send the message to each recipient on list, deconstructing it as we go */ - target.jobid = ORTE_PROC_MY_NAME->jobid; - while (NULL != (item = opal_list_remove_first(&recips))) { - nm = (orte_routed_tree_t*)item; - target.vpid = nm->vpid; - - ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); - - if (!PROC_IS_RUNNING(&target)) { - continue; - } - - ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target)); - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:send_relay sending relay msg to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&target))); - - if (ORTE_SUCCESS != (ret = orte_comm(&target, buf, ORTE_RML_TAG_DAEMON, - orte_daemon_cmd_processor))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - } - -CLEANUP: - /* cleanup */ - OBJ_DESTRUCT(&recips); -} - void orte_daemon_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) -{ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orted_recv_cmd: received message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - if (orte_timing) { - /* pickup the time the message was recvd by this daemon */ - gettimeofday(&mesg_recvd, NULL); - } - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release when processed - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, orte_daemon_cmd_processor); - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orted_recv_cmd: reissued recv", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); -} - -static int num_recursions=0; -static int wait_time=1; -#define MAX_RECURSIONS 24 - -void orte_daemon_cmd_processor(int fd, short event, void *data) -{ - orte_message_event_t *mev = (orte_message_event_t*)data; - orte_process_name_t *sender = &(mev->sender); - opal_buffer_t *buffer = mev->buffer; - opal_buffer_t relay_buf; - orte_rml_tag_t tag = mev->tag, target_tag; - orte_jobid_t job; - int ret; - ptrdiff_t unpack_rel, save_rel; - orte_std_cntr_t n; - orte_daemon_cmd_flag_t command, cmd; - - /* check to see if we are in a progress recursion */ - if (ORTE_PROC_IS_DAEMON && 1 < (ret = opal_progress_recursion_depth())) { - /* if we are in a recursion, we want to repost the message event - * so the progress engine can work its way back up to the top - * of the stack. 
Given that this could happen multiple times, - * we have to be careful to increase the time we wait so that - * we provide enough time - but not more time than necessary - for - * the stack to clear - */ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:cmd:processor in recursion depth %d\n\treposting %s for tag %ld", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ret, - ORTE_NAME_PRINT(sender), - (long)(tag))); - if (MAX_RECURSIONS < num_recursions) { - /* we need to abort if we get too far down this path */ - opal_output(0, "%s ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - OBJ_RELEASE(mev); - /* make sure our local procs are dead */ - orte_odls.kill_local_procs(NULL); - - /* do -not- call finalize as this will send a message to the HNP - * indicating clean termination! Instead, just forcibly cleanup - * the local session_dir tree and abort - */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - abort(); - } - wait_time = wait_time * 2; - ++num_recursions; - ORTE_MESSAGE_EVENT_DELAY(wait_time, mev); - return; - } - wait_time = 1; - num_recursions = 0; - - if (orte_timing && ORTE_PROC_IS_HNP) { - /* if we are doing timing, and we are the HNP, then the message doesn't come - * through the RML recv, so we have to pickup the recv time here - */ - gettimeofday(&mesg_recvd, NULL); - } - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:cmd:processor called by %s for tag %ld", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender), - (long)(tag))); - - /* save the original buffer pointers */ - unpack_rel = buffer->unpack_ptr - buffer->base_ptr; - - /* unpack the initial command */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); -#if OPAL_ENABLE_DEBUG - opal_output(0, "%s got message buffer from file %s line %d\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), mev->file, mev->line); -#endif - goto CLEANUP; - } - - /* see if this 
is a "process-and-relay" or "process" command - i.e., an xcast is underway */ - if (ORTE_DAEMON_PROCESS_AND_RELAY_CMD == command || - ORTE_DAEMON_PROCESS_CMD == command) { - /* get the target jobid and tag */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &target_tag, &n, ORTE_RML_TAG))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - /* save this buffer location */ - save_rel = buffer->unpack_ptr - buffer->base_ptr; - /* unpack the command that will actually be executed */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &cmd, &n, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - /* is this an add-procs cmd? */ - if (ORTE_DAEMON_ADD_LOCAL_PROCS == cmd) { - /* store the time the cmd was recvd */ - if (orte_timing) { - orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec; - orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec; - } - /* the cmd contains daemon update info - process it */ - if (ORTE_SUCCESS != (ret = orte_odls_base_default_update_daemon_info(buffer))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - /* flag this location */ - save_rel = buffer->unpack_ptr - buffer->base_ptr; - } - - if (ORTE_DAEMON_PROCESS_AND_RELAY_CMD == command) { - /* need to relay it */ - /* setup the relay buffer */ - OBJ_CONSTRUCT(&relay_buf, opal_buffer_t); - /* rewind the buffer to the beginning */ - buffer->unpack_ptr = buffer->base_ptr + unpack_rel; - /* copy everything to the relay buffer */ - opal_dss.copy_payload(&relay_buf, buffer); - /* do the relay */ - send_relay(&relay_buf); - /* cleanup */ - OBJ_DESTRUCT(&relay_buf); - } - - /* rewind the buffer to the right place for processing the cmd */ - buffer->unpack_ptr = buffer->base_ptr + save_rel; - - /* process the command */ - if (ORTE_SUCCESS != (ret = orte_daemon_process_commands(sender, buffer, tag))) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s 
orte:daemon:cmd:processor failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); - } - - /* done */ - goto CLEANUP; - - } else { - /* rewind the buffer so we can process it correctly */ - buffer->unpack_ptr = buffer->base_ptr + unpack_rel; - } - - /* process the command */ - if (ORTE_SUCCESS != (ret = orte_daemon_process_commands(sender, buffer, tag))) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:cmd:processor failed on error %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); - } - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s orte:daemon:cmd:processor: processing commands completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - -CLEANUP: - OBJ_RELEASE(mev); - /* reissue the non-blocking receive */ - ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, - ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); - if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(ret); - } - - return; -} - -int orte_daemon_process_commands(orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag) { orte_daemon_cmd_flag_t command; opal_buffer_t *relay_msg; @@ -368,7 +105,6 @@ int orte_daemon_process_commands(orte_process_name_t* sender, orte_rml_cmd_flag_t rml_cmd; orte_job_t *jdata; orte_process_name_t proc, proc2; - int32_t status; orte_process_name_t *return_addr; int32_t i, num_replies; bool hnp_accounted_for; @@ -384,7 +120,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender, n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); - return ret; + return; } cmd_str = get_orted_comm_cmd_str(command); @@ -415,7 +151,6 @@ int orte_daemon_process_commands(orte_process_name_t* sender, proct = OBJ_NEW(orte_proc_t); proct->name.jobid = proc.jobid; proct->name.vpid = proc.vpid; - ORTE_EPOCH_SET(proct->name.epoch,proc.epoch); opal_pointer_array_add(&procarray, proct); num_replies++; @@ -680,11 
+415,10 @@ int orte_daemon_process_commands(orte_process_name_t* sender, } } else { /* just deliver it to ourselves */ - if ((ret = orte_rml.send_buffer(ORTE_PROC_MY_NAME, relay_msg, target_tag, 0)) < 0) { + if ((ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay_msg, target_tag, 0, + orte_rml_send_callback, NULL)) < 0) { ORTE_ERROR_LOG(ret); - } else { - ret = ORTE_SUCCESS; - opal_progress(); /* give us a chance to move the message along */ + OBJ_RELEASE(relay_msg); } } } else { @@ -692,49 +426,10 @@ int orte_daemon_process_commands(orte_process_name_t* sender, if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay_msg, target_tag))) { ORTE_ERROR_LOG(ret); } + OBJ_RELEASE(relay_msg); } - OBJ_RELEASE(relay_msg); break; - /**** WAITPID_FIRED COMMAND ****/ - case ORTE_DAEMON_WAITPID_FIRED: - if (orte_debug_daemons_flag) { - opal_output(0, "%s orted_cmd: received waitpid_fired cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - /* unpack the name of the proc that terminated */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - /* unpack the termination status */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &status, &n, OPAL_INT32))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - /* pass it down for processing */ - orte_base_default_waitpid_fired(&proc, status); - break; - - - /**** IOF_COMPLETE COMMAND ****/ - case ORTE_DAEMON_IOF_COMPLETE: - if (orte_debug_daemons_flag) { - opal_output(0, "%s orted_cmd: received iof_complete cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } - /* unpack the name of the proc that completed */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } - /* pass it down for processing */ - orte_odls_base_notify_iof_complete(&proc); - break; - /**** EXIT COMMAND ****/ case ORTE_DAEMON_EXIT_CMD: if (orte_debug_daemons_flag) { @@ -743,17 +438,23 @@ int 
orte_daemon_process_commands(orte_process_name_t* sender, } /* kill the local procs */ orte_odls.kill_local_procs(NULL); - /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { + /* if all my routes and local children are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes()) { + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + proct->alive) { + /* at least one is still alive */ + return; + } + } /* call our appropriate exit procedure */ if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } - orte_quit(); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } - return ORTE_SUCCESS; + return; break; /**** HALT VM COMMAND ****/ @@ -765,8 +466,8 @@ int orte_daemon_process_commands(orte_process_name_t* sender, /* kill the local procs */ orte_odls.kill_local_procs(NULL); /* call our appropriate exit procedure */ - orte_quit(); - return ORTE_SUCCESS; + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); + return; break; /**** SPAWN JOB COMMAND ****/ @@ -801,11 +502,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, goto CLEANUP; } /* return response */ - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, - ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); break; /**** CONTACT QUERY COMMAND ****/ @@ -831,10 +532,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, goto CLEANUP; } - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, tag, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { 
ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); break; /**** REPORT_JOB_INFO_CMD COMMAND ****/ @@ -855,10 +557,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, OBJ_RELEASE(answer); goto CLEANUP; } - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); } else { /* if we are the HNP, process the request */ int32_t i, num_jobs; @@ -927,10 +630,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, } } } - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); } break; @@ -952,10 +656,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, OBJ_RELEASE(answer); goto CLEANUP; } - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); } else { /* if we are the HNP, process the request */ int32_t i, num_nodes; @@ -1020,10 +725,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, } } /* send the info */ - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); } break; @@ -1045,18 +751,16 @@ int orte_daemon_process_commands(orte_process_name_t* sender, OBJ_RELEASE(answer); goto CLEANUP; } - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, 
NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); } else { /* if we are the HNP, process the request */ orte_job_t *jdata; orte_proc_t *proc; orte_vpid_t vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; -#endif int32_t i, num_procs; /* setup the answer */ @@ -1083,14 +787,6 @@ int orte_daemon_process_commands(orte_process_name_t* sender, goto CLEANUP; } -#if ORTE_ENABLE_EPOCH - /* unpack the epoch */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &epoch, &n, ORTE_EPOCH))) { - ORTE_ERROR_LOG(ret); - goto CLEANUP; - } -#endif /* if they asked for a specific proc, then just get that info */ if (ORTE_VPID_WILDCARD != vpid) { @@ -1138,10 +834,11 @@ int orte_daemon_process_commands(orte_process_name_t* sender, } } /* send the info */ - if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); } break; @@ -1200,7 +897,6 @@ int orte_daemon_process_commands(orte_process_name_t* sender, /* loop across all daemons */ proc2.jobid = ORTE_PROC_MY_NAME->jobid; for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) { - ORTE_EPOCH_SET(proc2.epoch,orte_util_lookup_epoch(&proc2)); /* setup the cmd */ relay_msg = OBJ_NEW(opal_buffer_t); @@ -1333,19 +1029,19 @@ int orte_daemon_process_commands(orte_process_name_t* sender, ret = ORTE_ERR_COMM_FAILURE; break; } - if (ORTE_SUCCESS != (ret = orte_comm(return_addr, answer, ORTE_RML_TAG_TOOL, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb(return_addr, answer, ORTE_RML_TAG_TOOL, 0, + orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); } - OBJ_RELEASE(answer); break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - ret = 
ORTE_ERR_BAD_PARAM; } CLEANUP: - return ret; + return; } static char *get_orted_comm_cmd_str(int command) @@ -1363,11 +1059,7 @@ static char *get_orted_comm_cmd_str(int command) return strdup("ORTE_DAEMON_TREE_SPAWN"); case ORTE_DAEMON_MESSAGE_LOCAL_PROCS: return strdup("ORTE_DAEMON_MESSAGE_LOCAL_PROCS"); - case ORTE_DAEMON_WAITPID_FIRED: - return strdup("ORTE_DAEMON_WAITPID_FIRED"); - case ORTE_DAEMON_IOF_COMPLETE: - return strdup("ORTE_DAEMON_IOF_COMPLETE"); - case ORTE_DAEMON_EXIT_CMD: + case ORTE_DAEMON_EXIT_CMD: return strdup("ORTE_DAEMON_EXIT_CMD"); case ORTE_DAEMON_HALT_VM_CMD: return strdup("ORTE_DAEMON_HALT_VM_CMD"); diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index a62cc0638c..b7d9a85aa6 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009 Institut National de Recherche en Informatique * et Automatique. All rights reserved. 
@@ -70,6 +70,7 @@ #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" @@ -79,6 +80,7 @@ #include "orte/mca/ras/ras.h" #include "orte/mca/routed/routed.h" #include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/state/state.h" /* need access to the create_jobid fn used by plm components * so we can set singleton name, if necessary @@ -95,8 +97,9 @@ /* * Globals */ -static opal_event_t pipe_handler; +static opal_event_t *pipe_handler; static void shutdown_callback(int fd, short flags, void *arg); +static void pipe_closed(int fd, short flags, void *arg); static struct { bool debug; @@ -224,10 +227,6 @@ int orte_daemon(int argc, char *argv[]) opal_buffer_t *buffer; char hostname[100]; char *tmp_env_var = NULL; - struct timeval starttime, setuptime; - - /* get our time for first executable */ - gettimeofday(&starttime, NULL); /* initialize the globals */ memset(&orted_globals, 0, sizeof(orted_globals)); @@ -364,7 +363,7 @@ int orte_daemon(int argc, char *argv[]) * and have it kill us */ if (0 < orted_globals.fail_delay) { - ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback); + ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI); } else { opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -403,19 +402,17 @@ int orte_daemon(int argc, char *argv[]) orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid; - ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_EPOCH_MIN); /* if I am also the hnp, then update that contact info field too */ if (ORTE_PROC_IS_HNP) { orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; - 
ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_EPOCH_MIN); } /* setup the primary daemon command receive function */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, - ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); + ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); goto DONE; @@ -464,11 +461,14 @@ int orte_daemon(int argc, char *argv[]) if (orted_globals.uri_pipe > 0) { orte_job_t *jdata; orte_proc_t *proc; - orte_node_t **nodes; + orte_node_t *node; orte_app_context_t *app; char *tmp, *nptr, *sysinfo; - int rc; - int32_t ljob; + int32_t ljob, one32; + orte_vpid_t vpid1; + orte_local_rank_t lrank; + orte_node_rank_t nrank; + opal_byte_object_t *bo; /* setup the singleton's job */ jdata = OBJ_NEW(orte_job_t); @@ -489,18 +489,18 @@ int orte_daemon(int argc, char *argv[]) app->num_procs = 1; opal_pointer_array_add(jdata->apps, app); +#if 0 /* run our local allocator to read the available * allocation in case this singleton decides to * comm_spawn other procs */ - if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) { - ORTE_ERROR_LOG(rc); + if (ORTE_SUCCESS != (ret = orte_ras.allocate(jdata))) { + ORTE_ERROR_LOG(ret); /* don't quit as this would cause the singleton * to hang! 
*/ } - - nodes = (orte_node_t**)orte_node_pool->addr; +#endif /* setup a proc object for the singleton - since we * -must- be the HNP, and therefore we stored our @@ -510,15 +510,73 @@ int orte_daemon(int argc, char *argv[]) proc = OBJ_NEW(orte_proc_t); proc->name.jobid = jdata->jobid; proc->name.vpid = 0; - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); proc->state = ORTE_PROC_STATE_RUNNING; proc->app_idx = 0; - proc->node = nodes[0]; /* hnp node must be there */ - OBJ_RETAIN(nodes[0]); /* keep accounting straight */ + /* obviously, they are on my node */ + node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + proc->node = node; + OBJ_RETAIN(node); /* keep accounting straight */ opal_pointer_array_add(jdata->procs, proc); jdata->num_procs = 1; + jdata->num_local_procs = 1; + /* need to setup a pidmap for it */ + buffer = OBJ_NEW(opal_buffer_t); + opal_dss.pack(buffer, &jdata->jobid, 1, ORTE_JOBID); /* jobid */ + vpid1 = 1; + opal_dss.pack(buffer, &vpid1, 1, ORTE_VPID); /* num_procs */ +#if OPAL_HAVE_HWLOC + { + opal_hwloc_level_t bind_level; + bind_level = OPAL_HWLOC_NODE_LEVEL; + opal_dss.pack(buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* num_procs */ + } +#endif + one32 = 0; + opal_dss.pack(buffer, &one32, 1, OPAL_INT32); /* node index */ + lrank = 0; + opal_dss.pack(buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */ + nrank = 0; + opal_dss.pack(buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */ +#if OPAL_HAVE_HWLOC + { + uint bind_idx; + bind_idx = 0; + opal_dss.pack(buffer, &bind_idx, 1, OPAL_UINT); + } +#endif + /* setup a byte object and unload the packed data to it */ + bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); + opal_dss.unload(buffer, (void**)&bo->bytes, &bo->size); + OBJ_RELEASE(buffer); + /* save a copy to send back to the proc */ + opal_dss.copy((void**)&jdata->pmap, bo, OPAL_BYTE_OBJECT); + /* update our ess data - this will release the byte object's data */ + if (ORTE_SUCCESS != (ret = 
orte_ess.update_pidmap(bo))) { + ORTE_ERROR_LOG(ret); + } + free(bo); + + /* if we don't yet have a daemon map, then we have to generate one + * to pass back to it + */ + if (NULL == orte_odls_globals.dmap) { + orte_odls_globals.dmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); + /* construct a nodemap */ + if (ORTE_SUCCESS != (ret = orte_util_encode_nodemap(orte_odls_globals.dmap))) { + ORTE_ERROR_LOG(ret); + } + /* we also need to update our local nidmap - copy the dmap + * as this will release the byte object's data. The copy function + * will automatically malloc the bo itself, so we don't need to do so here + */ + opal_dss.copy((void**)&bo, orte_odls_globals.dmap, OPAL_BYTE_OBJECT); + if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) { + ORTE_ERROR_LOG(ret); + } + } + /* create a string that contains our uri + the singleton's name + sysinfo */ orte_util_convert_process_name_to_string(&nptr, &proc->name); orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model); @@ -540,12 +598,13 @@ int orte_daemon(int argc, char *argv[]) /* if we were given a pipe to monitor for singleton termination, set that up */ if (orted_globals.singleton_died_pipe > 0) { /* register shutdown handler */ - opal_event_set(opal_event_base, &pipe_handler, + pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t)); + opal_event_set(orte_event_base, pipe_handler, orted_globals.singleton_died_pipe, - OPAL_EV_READ|OPAL_EV_PERSIST, - shutdown_callback, + OPAL_EV_READ, + pipe_closed, &orted_globals.singleton_died_pipe); - opal_event_add(&pipe_handler, NULL); + opal_event_add(pipe_handler, NULL); } /* If I have a parent, then save his contact info so @@ -608,60 +667,10 @@ int orte_daemon(int argc, char *argv[]) OBJ_RELEASE(buffer); goto DONE; } - if (orte_timing) { - int64_t secs, usecs; - /* add our start time */ - secs = starttime.tv_sec; - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(ret); - 
OBJ_RELEASE(buffer); - goto DONE; - } - usecs = starttime.tv_usec; - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buffer); - goto DONE; - } - /* get and send our setup time */ - gettimeofday(&setuptime, NULL); - secs = setuptime.tv_sec - starttime.tv_sec; - if (starttime.tv_usec <= setuptime.tv_usec) { - usecs = setuptime.tv_usec - starttime.tv_usec; - } else { - secs--; - usecs = 1000000 - starttime.tv_usec + setuptime.tv_usec; - } - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buffer); - goto DONE; - } - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buffer); - goto DONE; - } - /* include the actual timestamp so the HNP can figure out how - * long it took for this message to arrive - */ - secs = setuptime.tv_sec; - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buffer); - goto DONE; - } - usecs = setuptime.tv_usec; - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buffer); - goto DONE; - } - } /* include our node name */ opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING); - + #if OPAL_HAVE_HWLOC /* add the local topology */ if (NULL != opal_hwloc_topology && @@ -688,25 +697,40 @@ int orte_daemon(int argc, char *argv[]) opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } - /* wait to hear we are done */ - opal_event_dispatch(opal_event_base); + /* loop the event lib until an exit event is detected */ + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } - /* should never get here, but if we do... 
*/ DONE: - /* Finalize and clean up ourselves */ - orte_quit(); - return ret; + /* update the exit status, in case it wasn't done */ + ORTE_UPDATE_EXIT_STATUS(orte_exit_status); + + /* cleanup and leave */ + orte_finalize(); + + if (orte_debug_flag) { + fprintf(stderr, "exiting with status %d\n", orte_exit_status); + } + exit(orte_exit_status); +} + +static void pipe_closed(int fd, short flags, void *arg) +{ + opal_event_t *ev = (opal_event_t*)arg; + + /* no error here - we just want to terminate */ + opal_event_free(ev); + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } static void shutdown_callback(int fd, short flags, void *arg) { - if (NULL != arg) { - /* it's the singleton pipe... remove that handler */ - opal_event_del(&pipe_handler); - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s orted: finalizing", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + orte_timer_t *tm = (orte_timer_t*)arg; + + if (NULL != tm) { + /* release the timer */ + OBJ_RELEASE(tm); } /* if we were ordered to abort, do so */ @@ -719,16 +743,13 @@ static void shutdown_callback(int fd, short flags, void *arg) orte_odls.kill_local_procs(NULL); orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); abort(); - } else if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) { - opal_output(0, "%s is executing clean abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* do -not- call finalize as this will send a message to the HNP - * indicating clean termination! Instead, just forcibly cleanup - * the local session_dir tree and exit - */ - orte_odls.kill_local_procs(NULL); - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } - - orte_quit(); + opal_output(0, "%s is executing clean abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + /* do -not- call finalize as this will send a message to the HNP + * indicating clean termination! 
Instead, just forcibly cleanup + * the local session_dir tree and exit + */ + orte_odls.kill_local_procs(NULL); + orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); + exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } diff --git a/orte/runtime/data_type_support/orte_dt_compare_fns.c b/orte/runtime/data_type_support/orte_dt_compare_fns.c index d3c1186c59..1fbbf3be7e 100644 --- a/orte/runtime/data_type_support/orte_dt_compare_fns.c +++ b/orte/runtime/data_type_support/orte_dt_compare_fns.c @@ -7,6 +7,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -76,20 +78,6 @@ int orte_dt_compare_name(orte_process_name_t *value1, } } -#if ORTE_ENABLE_EPOCH - /** check the epochs - if one of them is WILDCARD, then ignore - * this field since anything is okay - */ - if (value1->epoch!= ORTE_EPOCH_WILDCARD && - value2->epoch!= ORTE_EPOCH_WILDCARD) { - if (value1->epoch < value2->epoch) { - return OPAL_VALUE2_GREATER; - } else if (value1->epoch > value2->epoch) { - return OPAL_VALUE1_GREATER; - } - } -#endif - /** only way to get here is if all fields are equal or WILDCARD */ return OPAL_EQUAL; } @@ -124,23 +112,6 @@ int orte_dt_compare_jobid(orte_jobid_t *value1, return OPAL_EQUAL; } -#if ORTE_ENABLE_EPOCH -int orte_dt_compare_epoch(orte_epoch_t *value1, - orte_epoch_t *value2, - opal_data_type_t type) -{ - /** if either value is WILDCARD, then return equal */ - if (*value1 == ORTE_EPOCH_WILDCARD || - *value2 == ORTE_EPOCH_WILDCARD) return OPAL_EQUAL; - - if (*value1 > *value2) return OPAL_VALUE1_GREATER; - - if (*value2 > *value1) return OPAL_VALUE2_GREATER; - - return OPAL_EQUAL; -} -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /** * JOB @@ -281,16 +252,6 @@ int orte_dt_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_f return OPAL_EQUAL; } -/* 
ORTE_GRPCOMM_MODE */ -int orte_dt_compare_grpcomm_mode(orte_grpcomm_mode_t *value1, orte_grpcomm_mode_t *value2, opal_data_type_t type) -{ - if (*value1 > *value2) return OPAL_VALUE1_GREATER; - - if (*value2 > *value1) return OPAL_VALUE2_GREATER; - - return OPAL_EQUAL; -} - /* ORTE_IOF_TAG */ int orte_dt_compare_iof_tag(orte_iof_tag_t *value1, orte_iof_tag_t *value2, opal_data_type_t type) { diff --git a/orte/runtime/data_type_support/orte_dt_copy_fns.c b/orte/runtime/data_type_support/orte_dt_copy_fns.c index a5a3a6eda2..4d3bcede49 100644 --- a/orte/runtime/data_type_support/orte_dt_copy_fns.c +++ b/orte/runtime/data_type_support/orte_dt_copy_fns.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -61,7 +63,6 @@ int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, opal val->jobid = src->jobid; val->vpid = src->vpid; - ORTE_EPOCH_SET(val->epoch,src->epoch); *dest = val; return ORTE_SUCCESS; @@ -105,27 +106,6 @@ int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t typ return ORTE_SUCCESS; } -#if ORTE_ENABLE_EPOCH -/* - * EPOCH - */ -int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, opal_data_type_t type) -{ - orte_epoch_t *val; - - val = (orte_epoch_t*)malloc(sizeof(orte_epoch_t)); - if (NULL == val) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - *val = *src; - *dest = val; - - return ORTE_SUCCESS; -} -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /** @@ -382,23 +362,6 @@ int orte_dt_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_ return ORTE_SUCCESS; } -int orte_dt_copy_grpcomm_mode(orte_grpcomm_mode_t **dest, orte_grpcomm_mode_t *src, opal_data_type_t type) -{ - size_t datasize; - - datasize = 
sizeof(orte_grpcomm_mode_t); - - *dest = (orte_grpcomm_mode_t*)malloc(datasize); - if (NULL == *dest) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - memcpy(*dest, src, datasize); - - return ORTE_SUCCESS; -} - int orte_dt_copy_iof_tag(orte_iof_tag_t **dest, orte_iof_tag_t *src, opal_data_type_t type) { size_t datasize; diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index dcce98159d..3e3554c6e3 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -61,9 +63,6 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src, orte_process_name_t* proc; orte_jobid_t *jobid; orte_vpid_t *vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t *epoch; -#endif /* collect all the jobids in a contiguous array */ jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t)); @@ -105,27 +104,6 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src, } free(vpid); -#if ORTE_ENABLE_EPOCH - /* Collect all the epochs in a contiguous array */ - epoch = (orte_epoch_t *) malloc(num_vals * sizeof(orte_epoch_t)); - if (NULL == epoch) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - proc = (orte_process_name_t *) src; - for (i = 0; i < num_vals; i++) { - epoch[i] = proc->epoch; - proc++; - } - /* Now pack them in one shot. 
*/ - if (ORTE_SUCCESS != (rc = orte_dt_pack_epoch(buffer, epoch, num_vals, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - free(epoch); - return rc; - } - free(epoch); -#endif - return ORTE_SUCCESS; } @@ -163,24 +141,6 @@ int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src, return ret; } -#if ORTE_ENABLE_EPOCH -/* - * EPOCH - */ -int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src, - int32_t num_vals, opal_data_type_t type) -{ - int ret; - - /* Turn around pack the real type */ - if (ORTE_SUCCESS != (ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_EPOCH_T))) { - ORTE_ERROR_LOG(ret); - } - - return ret; -} -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /* * JOB @@ -201,20 +161,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, jobs = (orte_job_t**) src; for (i=0; i < num_vals; i++) { - /* pack the name of this job - may be null */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(jobs[i]->name)), 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* pack the name of the instance of the job - may be null */ - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, - (void*)(&(jobs[i]->instance)), 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(jobs[i]->jobid)), 1, ORTE_JOBID))) { @@ -957,22 +903,6 @@ int orte_dt_pack_daemon_cmd(opal_buffer_t *buffer, const void *src, int32_t num_ return ret; } -/* - * ORTE_GRPCOMM_MODE - */ -int orte_dt_pack_grpcomm_mode(opal_buffer_t *buffer, const void *src, int32_t num_vals, - opal_data_type_t type) -{ - int ret; - - /* Turn around and pack the real type */ - if (ORTE_SUCCESS != (ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_GRPCOMM_MODE_T))) { - ORTE_ERROR_LOG(ret); - } - - return ret; -} - /* * ORTE_IOF_TAG */ diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c index fe0cc9539b..045c85109f 100644 --- 
a/orte/runtime/data_type_support/orte_dt_print_fns.c +++ b/orte/runtime/data_type_support/orte_dt_print_fns.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -125,11 +127,6 @@ int orte_dt_std_print(char **output, char *prefix, void *src, opal_data_type_t t orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, ORTE_STD_CNTR_T); break; -#if ORTE_ENABLE_EPOCH - case ORTE_EPOCH: - orte_dt_quick_print(output, "ORTE_EPOCH", prefix, src, ORTE_EPOCH_T); -#endif - case ORTE_VPID: orte_dt_quick_print(output, "ORTE_VPID", prefix, src, ORTE_VPID_T); break; @@ -165,10 +162,6 @@ int orte_dt_std_print(char **output, char *prefix, void *src, opal_data_type_t t orte_dt_quick_print(output, "ORTE_DAEMON_CMD", prefix, src, ORTE_DAEMON_CMD_T); break; - case ORTE_GRPCOMM_MODE: - orte_dt_quick_print(output, "ORTE_GRPCOMM_MODE", prefix, src, ORTE_GRPCOMM_MODE_T); - break; - case ORTE_IOF_TAG: orte_dt_quick_print(output, "ORTE_IOF_TAG", prefix, src, ORTE_IOF_TAG_T); break; @@ -224,10 +217,8 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty asprintf(&pfx2, "%s", prefix); } - asprintf(&tmp, "\n%sData for job: %s\tName: %s\tInstance: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2, + asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tStdin target: %s\tState: %s\tAbort: %s", pfx2, ORTE_JOBID_PRINT(src->jobid), - (NULL != src->name) ? src->name : "NULL", - (NULL != src->instance) ? src->instance : "NULL", (src->enable_recovery) ? "ENABLED" : "DISABLED", (src->recovery_defined) ? 
"DEFINED" : "DEFAULT", pfx2, @@ -471,21 +462,11 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_ if (orte_xml_output) { /* need to create the output in XML format */ if (0 == src->pid) { -#if ORTE_ENABLE_EPOCH - asprintf(output, "%s\n", pfx2, - ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); -#else asprintf(output, "%s\n", pfx2, ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state)); -#endif } else { -#if ORTE_ENABLE_EPOCH - asprintf(output, "%s\n", pfx2, - ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch)); -#else asprintf(output, "%s\n", pfx2, ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state)); -#endif } free(pfx2); return ORTE_SUCCESS; @@ -493,16 +474,9 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_ if (!orte_devel_level_output) { /* just print a very simple output for users */ -#if ORTE_ENABLE_EPOCH - asprintf(&tmp, "\n%sProcess OMPI jobid: %s App: %ld Process rank: %s Epoch: %s", pfx2, - ORTE_JOBID_PRINT(src->name.jobid), (long)src->app_idx, - ORTE_VPID_PRINT(src->name.vpid), - ORTE_EPOCH_PRINT(src->name.epoch)); -#else asprintf(&tmp, "\n%sProcess OMPI jobid: %s App: %ld Process rank: %s", pfx2, ORTE_JOBID_PRINT(src->name.jobid), (long)src->app_idx, ORTE_VPID_PRINT(src->name.vpid)); -#endif /* set the return */ *output = tmp; diff --git a/orte/runtime/data_type_support/orte_dt_size_fns.c b/orte/runtime/data_type_support/orte_dt_size_fns.c index 01228a4001..fe13979f8e 100644 --- a/orte/runtime/data_type_support/orte_dt_size_fns.c +++ b/orte/runtime/data_type_support/orte_dt_size_fns.c @@ -8,6 +8,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. 
+ * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -45,12 +47,6 @@ int orte_dt_std_size(size_t *size, void *src, opal_data_type_t type) *size = sizeof(orte_std_cntr_t); break; -#if ORTE_ENABLE_EPOCH - case ORTE_EPOCH: - *size = sizeof(orte_epoch_t); - break; -#endif - case ORTE_VPID: *size = sizeof(orte_vpid_t); break; @@ -88,10 +84,6 @@ int orte_dt_std_size(size_t *size, void *src, opal_data_type_t type) *size = sizeof(orte_rml_tag_t); break; - case ORTE_GRPCOMM_MODE: - *size = sizeof(orte_grpcomm_mode_t); - break; - case ORTE_IOF_TAG: *size = sizeof(orte_iof_tag_t); break; diff --git a/orte/runtime/data_type_support/orte_dt_support.h b/orte/runtime/data_type_support/orte_dt_support.h index f58855a6f1..f2f53b8dc5 100644 --- a/orte/runtime/data_type_support/orte_dt_support.h +++ b/orte/runtime/data_type_support/orte_dt_support.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -29,7 +31,6 @@ #include "orte/types.h" #include "opal/dss/dss_types.h" -#include "orte/mca/grpcomm/grpcomm_types.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/rmaps/rmaps_types.h" @@ -52,14 +53,7 @@ int orte_dt_compare_jobid(orte_jobid_t *value1, int orte_dt_compare_vpid(orte_vpid_t *value1, orte_vpid_t *value2, opal_data_type_t type); -#if ORTE_ENABLE_EPOCH -int orte_dt_compare_epoch(orte_epoch_t *value1, - orte_epoch_t *value2, - opal_data_type_t type); -#define ORTE_EPOCH_CMP(n,m) ( (m) - (n) ) -#else -#define ORTE_EPOCH_CMP(n,m) ( 0 ) -#endif + #if !ORTE_DISABLE_FULL_SUPPORT int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, opal_data_type_t type); int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, opal_data_type_t type); @@ -82,7 +76,6 @@ int orte_dt_compare_tags(orte_rml_tag_t *value1, orte_rml_tag_t *value2, opal_data_type_t type); int orte_dt_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, opal_data_type_t type); -int orte_dt_compare_grpcomm_mode(orte_grpcomm_mode_t *value1, orte_grpcomm_mode_t *value2, opal_data_type_t type); int orte_dt_compare_iof_tag(orte_iof_tag_t *value1, orte_iof_tag_t *value2, opal_data_type_t type); #endif @@ -91,9 +84,7 @@ int orte_dt_copy_std_cntr(orte_std_cntr_t **dest, orte_std_cntr_t *src, opal_dat int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, opal_data_type_t type); int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, opal_data_type_t type); int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t type); -#if ORTE_ENABLE_EPOCH -int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, opal_data_type_t type); -#endif + #if !ORTE_DISABLE_FULL_SUPPORT int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t type); int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, 
opal_data_type_t type); @@ -108,7 +99,6 @@ int orte_dt_copy_tag(orte_rml_tag_t **dest, orte_rml_tag_t *src, opal_data_type_t type); int orte_dt_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_t *src, opal_data_type_t type); -int orte_dt_copy_grpcomm_mode(orte_grpcomm_mode_t **dest, orte_grpcomm_mode_t *src, opal_data_type_t type); int orte_dt_copy_iof_tag(orte_iof_tag_t **dest, orte_iof_tag_t *src, opal_data_type_t type); #endif @@ -123,10 +113,7 @@ int orte_dt_pack_jobid(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type); int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type); -#if ORTE_ENABLE_EPOCH -int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src, - int32_t num_vals, opal_data_type_t type); -#endif + #if !ORTE_DISABLE_FULL_SUPPORT int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type); @@ -152,8 +139,6 @@ int orte_dt_pack_tag(opal_buffer_t *buffer, opal_data_type_t type); int orte_dt_pack_daemon_cmd(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type); -int orte_dt_pack_grpcomm_mode(opal_buffer_t *buffer, const void *src, - int32_t num_vals, opal_data_type_t type); int orte_dt_pack_iof_tag(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type); #endif @@ -192,10 +177,7 @@ int orte_dt_unpack_jobid(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type); int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type); -#if ORTE_ENABLE_EPOCH -int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest, - int32_t *num_vals, opal_data_type_t type); -#endif + #if !ORTE_DISABLE_FULL_SUPPORT int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type); @@ -221,8 +203,6 @@ int orte_dt_unpack_tag(opal_buffer_t *buffer, opal_data_type_t type); int 
orte_dt_unpack_daemon_cmd(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type); -int orte_dt_unpack_grpcomm_mode(opal_buffer_t *buffer, void *dest, - int32_t *num_vals, opal_data_type_t type); int orte_dt_unpack_iof_tag(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type); #endif diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index ec4dc9906b..7fb89f87c1 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -56,9 +58,6 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest, orte_process_name_t* proc; orte_jobid_t *jobid; orte_vpid_t *vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t *epoch; -#endif num = *num_vals; @@ -96,39 +95,15 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest, return rc; } -#if ORTE_ENABLE_EPOCH - /* collect all the epochs in a contiguous array */ - epoch= (orte_epoch_t*)malloc(num * sizeof(orte_epoch_t)); - if (NULL == epoch) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - *num_vals = 0; - free(jobid); - return ORTE_ERR_OUT_OF_RESOURCE; - } - /* now unpack them in one shot */ - if (ORTE_SUCCESS != (rc = - orte_dt_unpack_epoch(buffer, epoch, num_vals, ORTE_EPOCH))) { - ORTE_ERROR_LOG(rc); - *num_vals = 0; - free(epoch); - free(jobid); - return rc; - } -#endif - - /* build the names from the jobid/vpid/epoch arrays */ + /* build the names from the jobid/vpid arrays */ proc = (orte_process_name_t*)dest; for (i=0; i < num; i++) { proc->jobid = jobid[i]; proc->vpid = vpid[i]; - ORTE_EPOCH_SET(proc->epoch,epoch[i]); proc++; } /* 
cleanup */ -#if ORTE_ENABLE_EPOCH - free(epoch); -#endif free(vpid); free(jobid); @@ -167,24 +142,6 @@ int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest, return ret; } -#if ORTE_ENABLE_EPOCH -/* - * EPOCH - */ -int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest, - int32_t *num_vals, opal_data_type_t type) -{ - int ret; - - /* Turn around and unpack the real type */ - if (ORTE_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_EPOCH_T))) { - ORTE_ERROR_LOG(ret); - } - - return ret; -} -#endif - #if !ORTE_DISABLE_FULL_SUPPORT /* * JOB @@ -213,22 +170,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return ORTE_ERR_OUT_OF_RESOURCE; } - /* unpack the name of this job - may be null */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &(jobs[i]->name), &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* unpack the instance name of this job - may be null */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &(jobs[i]->instance), &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* unpack the jobid */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, @@ -1048,20 +989,6 @@ int orte_dt_unpack_daemon_cmd(opal_buffer_t *buffer, void *dest, int32_t *num_va return ret; } -/* - * ORTE_GRPCOMM_MODE - */ -int orte_dt_unpack_grpcomm_mode(opal_buffer_t *buffer, void *dest, int32_t *num_vals, - opal_data_type_t type) -{ - int ret; - - /* turn around and unpack the real type */ - ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_GRPCOMM_MODE_T); - - return ret; -} - /* * ORTE_IOF_TAG */ diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index 22190e0031..3a4e7b142d 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -96,7 +96,7 @@ int orte_data_server_init(void) if (!recv_issued) { if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_SERVER, - 
ORTE_RML_NON_PERSISTENT, + ORTE_RML_PERSISTENT, orte_data_server, NULL))) { ORTE_ERROR_LOG(rc); @@ -147,25 +147,36 @@ static orte_data_object_t *lookup(char *service) return NULL; } -static void process_message(int fd, short event, void *evdat) +static void rml_cbfunc(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) +{ + OBJ_RELEASE(buffer); +} + +void orte_data_server(int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)evdat; - orte_process_name_t *sender = &mev->sender; - opal_buffer_t *buffer = mev->buffer; orte_data_server_cmd_t command; orte_std_cntr_t count; char *service_name, *port_name; orte_data_object_t *data; - opal_buffer_t answer; + opal_buffer_t *answer; int rc, ret; count = 1; + + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s data server got message from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender))); if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_DATA_SERVER_CMD))) { ORTE_ERROR_LOG(rc); return; } - OBJ_CONSTRUCT(&answer, opal_buffer_t); + answer = OBJ_NEW(opal_buffer_t); switch(command) { case ORTE_DATA_SERVER_PUBLISH: @@ -200,7 +211,7 @@ static void process_message(int fd, short event, void *evdat) service_name, port_name)); ret = ORTE_EXISTS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -219,7 +230,6 @@ static void process_message(int fd, short event, void *evdat) data->port = port_name; data->owner.jobid = sender->jobid; data->owner.vpid = sender->vpid; - ORTE_EPOCH_SET(data->owner.epoch,sender->epoch); /* store the data */ data->index = opal_pointer_array_add(orte_data_server_store, data); @@ -231,7 +241,7 @@ static 
void process_message(int fd, short event, void *evdat) /* tell the user it was wonderful... */ ret = ORTE_SUCCESS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -263,7 +273,7 @@ static void process_message(int fd, short event, void *evdat) /* return ORTE_ERR_NOT_FOUND error code */ ret = ORTE_ERR_NOT_FOUND; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -281,7 +291,7 @@ static void process_message(int fd, short event, void *evdat) * always unpack an int first */ ret = ORTE_SUCCESS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -290,7 +300,7 @@ static void process_message(int fd, short event, void *evdat) } /* pack the returned port */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &data->port, 1, OPAL_STRING))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &data->port, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -322,7 +332,7 @@ static void process_message(int fd, short event, void *evdat) /* return ORTE_ERR_NOT_FOUND error code */ ret = ORTE_ERR_NOT_FOUND; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so 
just send whatever is there @@ -342,7 +352,7 @@ static void process_message(int fd, short event, void *evdat) /* nope - return ORTE_ERR_PERM error code */ ret = ORTE_ERR_PERM; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -362,7 +372,7 @@ static void process_message(int fd, short event, void *evdat) /* tell the sender this succeeded */ ret = ORTE_SUCCESS; - if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &ret, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); /* if we can't pack it, we probably can't pack the * rc value either, so just send whatever is there @@ -379,49 +389,15 @@ static void process_message(int fd, short event, void *evdat) SEND_ERROR: /* pack the error code */ - if (ORTE_SUCCESS != (ret = opal_dss.pack(&answer, &rc, 1, OPAL_INT))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); } SEND_ANSWER: - if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_DATA_CLIENT, 0))) { + if (0 > (rc = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_DATA_CLIENT, 0, rml_cbfunc, NULL))) { ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); } - OBJ_DESTRUCT(&answer); - - OBJ_RELEASE(mev); } -void orte_data_server(int status, orte_process_name_t* sender, - opal_buffer_t* buffer, orte_rml_tag_t tag, - void* cbdata) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s data server got message from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release above - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_message); - - /* reissue the recv */ - if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_DATA_SERVER, - ORTE_RML_NON_PERSISTENT, - orte_data_server, - NULL))) { - ORTE_ERROR_LOG(rc); - } - -} diff --git a/orte/runtime/orte_finalize.c b/orte/runtime/orte_finalize.c index 465cb75a9f..a5d2ba9def 100644 --- a/orte/runtime/orte_finalize.c +++ b/orte/runtime/orte_finalize.c @@ -47,7 +47,7 @@ int orte_finalize(void) if (!orte_initialized) { return ORTE_SUCCESS; } - + /* protect against multiple calls */ if (opal_atomic_trylock(&orte_finalize_lock)) { return ORTE_SUCCESS; @@ -58,7 +58,7 @@ int orte_finalize(void) /* close the orte_show_help system */ orte_show_help_finalize(); - + /* call the finalize function for this environment */ orte_ess.finalize(); diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 733f58f93c..fed5dcad34 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -11,8 +11,8 @@ * All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. All rights - * reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -116,6 +116,7 @@ opal_buffer_t *orte_tree_launch_cmd = NULL; opal_pointer_array_t *orte_job_data; opal_pointer_array_t *orte_node_pool; opal_pointer_array_t *orte_node_topologies; +opal_pointer_array_t *orte_local_children; /* a clean output channel without prefix */ int orte_clean_output = -1; @@ -123,17 +124,6 @@ int orte_clean_output = -1; /* Nidmap and job maps */ opal_pointer_array_t orte_nidmap; opal_pointer_array_t orte_jobmap; -char *orted_launch_cmd = NULL; - -/* list of local children on a daemon */ -opal_list_t orte_local_children; -opal_mutex_t orte_local_children_lock; -opal_condition_t orte_local_children_cond; - -/* list of job data for local children on a daemon */ -opal_list_t orte_local_jobdata; -opal_mutex_t orte_local_jobdata_lock; -opal_condition_t orte_local_jobdata_cond; /* IOF controls */ bool orte_tag_output; @@ -180,6 +170,10 @@ bool orte_report_child_jobs_separately; struct timeval orte_child_time_to_exit; bool orte_abort_non_zero_exit; +/* State Machine */ +opal_list_t orte_job_states; +opal_list_t orte_proc_states; + /* length of stat history to keep */ int orte_stat_history_size; @@ -189,6 +183,11 @@ char *orte_forward_envars = NULL; /* preload binaries */ bool orte_preload_binaries = false; +/* progress thread */ +#if ORTE_ENABLE_PROGRESS_THREAD +opal_thread_t orte_progress_thread; +#endif + #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; @@ -278,22 +277,6 @@ int orte_dt_init(void) return rc; } -#if ORTE_ENABLE_EPOCH - tmp = ORTE_EPOCH; - if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_epoch, - orte_dt_unpack_epoch, - (opal_dss_copy_fn_t)orte_dt_copy_epoch, - (opal_dss_compare_fn_t)orte_dt_compare_epoch, - (opal_dss_size_fn_t)orte_dt_std_size, - (opal_dss_print_fn_t)orte_dt_std_print, - (opal_dss_release_fn_t)orte_dt_std_release, - OPAL_DSS_UNSTRUCTURED, - "ORTE_EPOCH", &tmp))) { - ORTE_ERROR_LOG(rc); - return rc; - } -#endif - #if 
!ORTE_DISABLE_FULL_SUPPORT tmp = ORTE_JOB; if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_job, @@ -449,20 +432,6 @@ int orte_dt_init(void) return rc; } - tmp = ORTE_GRPCOMM_MODE; - if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_grpcomm_mode, - orte_dt_unpack_grpcomm_mode, - (opal_dss_copy_fn_t)orte_dt_copy_grpcomm_mode, - (opal_dss_compare_fn_t)orte_dt_compare_grpcomm_mode, - (opal_dss_size_fn_t)orte_dt_std_size, - (opal_dss_print_fn_t)orte_dt_std_print, - (opal_dss_release_fn_t)orte_dt_std_release, - OPAL_DSS_UNSTRUCTURED, - "ORTE_GRPCOMM_MODE", &tmp))) { - ORTE_ERROR_LOG(rc); - return rc; - } - tmp = ORTE_IOF_TAG; if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_iof_tag, orte_dt_unpack_iof_tag, @@ -535,31 +504,6 @@ orte_vpid_t orte_get_lowest_vpid_alive(orte_jobid_t job) return ORTE_VPID_INVALID; } -int orte_global_comm(orte_process_name_t *recipient, - opal_buffer_t *buf, orte_rml_tag_t tag, - orte_default_cbfunc_t cbfunc) -{ - int ret; - orte_ns_cmp_bitmask_t mask; - - mask = ORTE_NS_CMP_ALL; - - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, recipient, ORTE_PROC_MY_NAME) && - NULL != cbfunc) { - /* if I am the recipient and a direct fn is provided, use a message event */ - ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buf, tag, cbfunc); - ret = ORTE_SUCCESS; - } else { - /* go ahead and send it */ - if (0 > (ret = orte_rml.send_buffer(recipient, buf, tag, 0))) { - ORTE_ERROR_LOG(ret); - } else { - ret = ORTE_SUCCESS; - } - } - return ret; -} - /* * CONSTRUCTORS, DESTRUCTORS, AND CLASS INSTANTIATIONS * FOR ORTE CLASSES @@ -681,8 +625,6 @@ OBJ_CLASS_INSTANCE(orte_app_context_t, static void orte_job_construct(orte_job_t* job) { - job->name = NULL; - job->instance = NULL; job->jobid = ORTE_JOBID_INVALID; job->apps = OBJ_NEW(opal_pointer_array_t); opal_pointer_array_init(job->apps, @@ -703,26 +645,25 @@ static void orte_job_construct(orte_job_t* job) job->map = NULL; job->bookmark = NULL; job->state = ORTE_JOB_STATE_UNDEF; 
+ job->restart = false; job->num_launched = 0; job->num_reported = 0; job->num_terminated = 0; job->num_daemons_reported = 0; + job->num_non_zero_exit = 0; job->abort = false; job->aborted_proc = NULL; - OBJ_CONSTRUCT(&job->dyn_spawn_lock, opal_mutex_t); - OBJ_CONSTRUCT(&job->dyn_spawn_cond, opal_condition_t); - job->dyn_spawn_active = false; - + job->originator.jobid = ORTE_JOBID_INVALID; + job->originator.vpid = ORTE_VPID_INVALID; + job->recovery_defined = false; job->enable_recovery = false; - - job->launch_msg_sent.tv_sec = 0; - job->launch_msg_sent.tv_usec = 0; - job->max_launch_msg_recvd.tv_sec = 0; - job->max_launch_msg_recvd.tv_usec = 0; - + job->num_local_procs = 0; + + job->pmap = NULL; + #if OPAL_ENABLE_FT_CR == 1 job->ckpt_state = 0; job->ckpt_snapshot_ref = NULL; @@ -736,7 +677,7 @@ static void orte_job_destruct(orte_job_t* job) orte_app_context_t *app; orte_job_t *jdata; int n; - + if (NULL == job) { /* probably just a race condition - just return */ return; @@ -747,14 +688,6 @@ static void orte_job_destruct(orte_job_t* job) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid)); } - if (NULL != job->name) { - free(job->name); - } - - if (NULL != job->instance) { - free(job->instance); - } - for (n=0; n < job->apps->size; n++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, n))) { continue; @@ -776,9 +709,13 @@ static void orte_job_destruct(orte_job_t* job) } OBJ_RELEASE(job->procs); - OBJ_DESTRUCT(&job->dyn_spawn_lock); - OBJ_DESTRUCT(&job->dyn_spawn_cond); - + if (NULL != job->pmap) { + if (NULL != job->pmap->bytes) { + free(job->pmap->bytes); + } + free(job->pmap); + } + #if OPAL_ENABLE_FT_CR == 1 if (NULL != job->ckpt_snapshot_ref) { free(job->ckpt_snapshot_ref); @@ -907,6 +844,7 @@ static void orte_proc_construct(orte_proc_t* proc) proc->app_rank = -1; proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF; proc->state = ORTE_PROC_STATE_UNDEF; + proc->alive = false; proc->app_idx = 0; #if OPAL_HAVE_HWLOC 
proc->locale = NULL; @@ -914,6 +852,8 @@ static void orte_proc_construct(orte_proc_t* proc) proc->cpu_bitmap = NULL; #endif proc->node = NULL; + proc->local_proc = false; + proc->do_not_barrier = false; proc->prior_node = NULL; proc->nodename = NULL; proc->exit_code = 0; /* Assume we won't fail unless otherwise notified */ @@ -926,7 +866,10 @@ static void orte_proc_construct(orte_proc_t* proc) proc->beat = 0; OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t); opal_ring_buffer_init(&proc->stats, orte_stat_history_size); - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); + proc->registered = false; + proc->deregistered = false; + proc->iof_complete = false; + proc->waitpid_recvd = false; #if OPAL_ENABLE_FT_CR == 1 proc->ckpt_state = 0; proc->ckpt_snapshot_ref = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index c2f71ed2f2..e41c7d636a 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -11,8 +11,8 @@ * All rights reserved. * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011 Los Alamos National Security, LLC. All rights - * reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -40,6 +40,7 @@ #include "opal/class/opal_value_array.h" #include "opal/class/opal_ring_buffer.h" #include "opal/threads/threads.h" +#include "opal/mca/event/event.h" #include "opal/mca/hwloc/hwloc.h" #include "opal/mca/paffinity/paffinity.h" @@ -47,6 +48,7 @@ #include "orte/mca/rml/rml_types.h" #include "orte/util/proc_info.h" #include "orte/util/name_fns.h" +#include "orte/util/error_strings.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_wait.h" @@ -62,6 +64,8 @@ ORTE_DECLSPEC extern char *orte_job_ident; /* instantiated in orte/runtime/orte ORTE_DECLSPEC extern bool orte_create_session_dirs; /* instantiated in orte/runtime/orte_init.c */ ORTE_DECLSPEC extern bool orte_execute_quiet; /* instantiated in orte/runtime/orte_globals.c */ ORTE_DECLSPEC extern bool orte_report_silent_errors; /* instantiated in orte/runtime/orte_globals.c */ +ORTE_DECLSPEC extern opal_event_base_t *orte_event_base; /* instantiated in orte/runtime/orte_init.c */ +ORTE_DECLSPEC extern bool orte_event_base_active; /* Shortcut for some commonly used names */ #define ORTE_NAME_WILDCARD (&orte_name_wildcard) @@ -105,6 +109,23 @@ typedef struct orte_app_context_t orte_app_context_t; #else +#if ORTE_ENABLE_PROGRESS_THREAD +ORTE_DECLSPEC extern opal_thread_t orte_progress_thread; +#endif + +/* ORTE event priorities - we define these + * at levels that permit higher layers such as + * OMPI to handle their events at higher priority, + * with the exception of errors. 
Errors generally + * require exception handling (e.g., ctrl-c termination) + * that overrides the need to process MPI messages + */ +#define ORTE_ERROR_PRI OPAL_EV_ERROR_PRI +#define ORTE_MSG_PRI OPAL_EV_MSG_LO_PRI +#define ORTE_SYS_PRI OPAL_EV_SYS_LO_PRI +#define ORTE_INFO_PRI OPAL_EV_INFO_LO_PRI + + #define ORTE_GLOBAL_ARRAY_BLOCK_SIZE 64 #define ORTE_GLOBAL_ARRAY_MAX_SIZE INT_MAX @@ -179,7 +200,9 @@ typedef uint16_t orte_job_controls_t; #define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x0040 #define ORTE_JOB_CONTROL_RECOVERABLE 0x0080 #define ORTE_JOB_CONTROL_SPIN_FOR_DEBUG 0x0100 - +#define ORTE_JOB_CONTROL_RESTART 0x0200 +#define ORTE_JOB_CONTROL_PROCS_MIGRATING 0x0400 + /* global type definitions used by RTE - instanced in orte_globals.c */ /************ @@ -316,10 +339,6 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t); typedef struct { /** Base object so this can be put on a list */ opal_list_item_t super; - /* a name for this job */ - char *name; - /* a name for this instance of the job */ - char *instance; /* jobid for this job */ orte_jobid_t jobid; /* app_context array for this job */ @@ -348,6 +367,8 @@ typedef struct { orte_node_t *bookmark; /* state of the overall job */ orte_job_state_t state; + /* some procs in this job are being restarted */ + bool restart; /* number of procs launched */ orte_vpid_t num_launched; /* number of procs reporting contact info */ @@ -356,10 +377,10 @@ typedef struct { orte_vpid_t num_terminated; /* number of daemons reported launched so we can track progress */ orte_vpid_t num_daemons_reported; - /* lock/cond/flag for tracking when all procs reported on dynamic spawn */ - opal_mutex_t dyn_spawn_lock; - opal_condition_t dyn_spawn_cond; - bool dyn_spawn_active; + /* number of procs with non-zero exit codes */ + int32_t num_non_zero_exit; + /* originator of a dynamic spawn */ + orte_process_name_t originator; /* did this job abort? 
*/ bool abort; /* proc that caused that to happen */ @@ -370,8 +391,13 @@ typedef struct { bool enable_recovery; /* time launch message was sent */ struct timeval launch_msg_sent; + /* time launch message was recvd */ + struct timeval launch_msg_recvd; /* max time for launch msg to be received */ struct timeval max_launch_msg_recvd; + orte_vpid_t num_local_procs; + /* pidmap for delivery to procs */ + opal_byte_object_t *pmap; #if OPAL_ENABLE_FT_CR == 1 /* ckpt state */ size_t ckpt_state; @@ -412,6 +438,10 @@ struct orte_proc_t { orte_proc_state_t last_errmgr_state; /* process state */ orte_proc_state_t state; + /* shortcut for determinng proc has been launched + * and has not yet terminated + */ + bool alive; /* exit code */ orte_exit_code_t exit_code; /* the app_context that generated this proc */ @@ -426,6 +456,12 @@ struct orte_proc_t { #endif /* pointer to the node where this proc is executing */ orte_node_t *node; + /* indicate that this proc is local */ + bool local_proc; + /* indicate that this proc should not barrier - used + * for restarting processes + */ + bool do_not_barrier; /* pointer to the node where this proc last executed */ orte_node_t *prior_node; /* name of the node where this proc is executing - this @@ -447,6 +483,11 @@ struct orte_proc_t { int beat; /* history of resource usage - sized by sensor framework */ opal_ring_buffer_t stats; + /* track finalization */ + bool registered; + bool deregistered; + bool iof_complete; + bool waitpid_recvd; #if OPAL_ENABLE_FT_CR == 1 /* ckpt state */ size_t ckpt_state; @@ -581,6 +622,7 @@ ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd; ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies; +ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children; /* a clean output channel without prefix */ ORTE_DECLSPEC extern int orte_clean_output; @@ -588,17 +630,6 @@ 
ORTE_DECLSPEC extern int orte_clean_output; /* Nidmap and job maps */ ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap; ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap; -ORTE_DECLSPEC extern char *orted_launch_cmd; - -/* list of local children on a daemon */ -ORTE_DECLSPEC extern opal_list_t orte_local_children; -ORTE_DECLSPEC extern opal_mutex_t orte_local_children_lock; -ORTE_DECLSPEC extern opal_condition_t orte_local_children_cond; - -/* list of job data for local children on a daemon */ -ORTE_DECLSPEC extern opal_list_t orte_local_jobdata; -ORTE_DECLSPEC extern opal_mutex_t orte_local_jobdata_lock; -ORTE_DECLSPEC extern opal_condition_t orte_local_jobdata_cond; /* whether or not to forward SIGTSTP and SIGCONT signals */ ORTE_DECLSPEC extern bool orte_forward_job_control; @@ -627,12 +658,11 @@ ORTE_DECLSPEC extern char *orte_node_regex; ORTE_DECLSPEC extern bool orte_report_events; ORTE_DECLSPEC extern char *orte_report_events_uri; -/* barrier control */ -ORTE_DECLSPEC extern bool orte_do_not_barrier; - /* process recovery */ ORTE_DECLSPEC extern bool orte_enable_recovery; ORTE_DECLSPEC extern int32_t orte_max_restarts; +/* barrier control */ +ORTE_DECLSPEC extern bool orte_do_not_barrier; /* comm interface */ typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data); @@ -641,17 +671,16 @@ typedef int (*orte_default_comm_fn_t)(orte_process_name_t *recipient, opal_buffer_t *buf, orte_rml_tag_t tag, orte_default_cbfunc_t cbfunc); -/* comm fn for updating state */ -ORTE_DECLSPEC extern orte_default_comm_fn_t orte_comm; -ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient, - opal_buffer_t *buf, orte_rml_tag_t tag, - orte_default_cbfunc_t cbfunc); /* exit status reporting */ ORTE_DECLSPEC extern bool orte_report_child_jobs_separately; ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit; ORTE_DECLSPEC extern bool orte_abort_non_zero_exit; +/* State Machine lists */ +ORTE_DECLSPEC extern opal_list_t orte_job_states; 
+ORTE_DECLSPEC extern opal_list_t orte_proc_states; + /* length of stat history to keep */ ORTE_DECLSPEC extern int orte_stat_history_size; diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index a344ce3658..a12b7d0028 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. @@ -34,6 +34,7 @@ #include "opal/util/error.h" #include "opal/util/output.h" #include "opal/runtime/opal.h" +#include "opal/threads/threads.h" #include "orte/util/show_help.h" #include "orte/mca/ess/base/base.h" @@ -55,19 +56,16 @@ bool orte_debug_flag = false; int orte_debug_verbosity; char *orte_prohibited_session_dirs = NULL; bool orte_create_session_dirs = true; +opal_event_base_t *orte_event_base; +bool orte_event_base_active = true; -#if ORTE_ENABLE_EPOCH -orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD}; -#else orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD}; -#endif -#if ORTE_ENABLE_EPOCH -orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_INVALID}; -#else orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}; -#endif +#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREAD +static void* orte_progress_thread_engine(opal_object_t *obj); +#endif #if OPAL_CC_USE_PRAGMA_IDENT #pragma ident ORTE_IDENT_STRING @@ -130,7 +128,26 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) error = "orte_ess_base_select"; goto error; } - + + if (ORTE_PROC_IS_APP) 
{ +#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREAD + /* get a separate orte event base */ + orte_event_base = opal_event_base_create(); + /* fork off a thread to progress it */ + orte_progress_thread.t_run = orte_progress_thread_engine; + if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) { + error = "orte progress thread start"; + goto error; + } +#else + /* set the event base to the opal one */ + orte_event_base = opal_event_base; +#endif + } else { + /* set the event base to the opal one */ + orte_event_base = opal_event_base; + } + /* initialize the RTE for this environment */ if (ORTE_SUCCESS != (ret = orte_ess.init())) { error = "orte_ess_init"; @@ -141,7 +158,7 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) orte_initialized = true; return ORTE_SUCCESS; -error: + error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime", "orte_init:startup:internal-failure", @@ -151,3 +168,13 @@ error: return ret; } + +#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREAD +static void* orte_progress_thread_engine(opal_object_t *obj) +{ + while (orte_event_base->active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + return OPAL_THREAD_CANCELLED; +} +#endif diff --git a/orte/runtime/orte_quit.c b/orte/runtime/orte_quit.c index 61b65b930b..f2105178ff 100644 --- a/orte/runtime/orte_quit.c +++ b/orte/runtime/orte_quit.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. 
* $COPYRIGHT$ @@ -51,6 +51,7 @@ #include "orte/mca/plm/plm.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/routed/routed.h" +#include "orte/mca/state/state.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" @@ -68,102 +69,75 @@ static int num_aborted = 0; static int num_killed = 0; static int num_failed_start = 0; +static bool errors_reported = false; static void dump_aborted_procs(void); #endif -void orte_jobs_complete(void) +void orte_quit(int fd, short args, void *cbdata) { -#if !ORTE_DISABLE_FULL_SUPPORT - /* check one-time lock to protect against multiple calls */ - if (opal_atomic_trylock(&orte_jobs_complete_lock)) { /* returns 1 if already locked */ - return; + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + + /* cleanup */ + if (NULL != caddy) { + OBJ_RELEASE(caddy); } - /* if we never launched, just skip this part to avoid - * meaningless error messages - */ - if (orte_never_launched) { - ORTE_UPDATE_EXIT_STATUS(orte_exit_status); - orte_quit(); - } - - if (0 != orte_exit_status && !orte_execute_quiet) { - /* abnormal termination of some kind */ - dump_aborted_procs(); - /* If we showed more abort messages than were allowed, - show a followup message here */ - if (num_failed_start > 1) { - if (orte_xml_output) { - fprintf(orte_xml_fp, ""); - } - fprintf(orte_xml_fp, "%d total process%s failed to start", - num_failed_start, ((num_failed_start > 1) ? "es" : "")); - if (orte_xml_output) { - fprintf(orte_xml_fp, " "); - } - fprintf(orte_xml_fp, "\n"); - } - if (num_aborted > 1) { - if (orte_xml_output) { - fprintf(orte_xml_fp, ""); - } - fprintf(orte_xml_fp, "%d total process%s aborted", - num_aborted, ((num_aborted > 1) ? 
"es" : "")); - if (orte_xml_output) { - fprintf(orte_xml_fp, " "); - } - fprintf(orte_xml_fp, "\n"); - } - if (num_killed > 1) { - if (orte_xml_output) { - fprintf(orte_xml_fp, ""); - } - fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)", - num_killed, ((num_killed > 1) ? "es" : ""), orte_basename); - if (orte_xml_output) { - fprintf(orte_xml_fp, " "); - } - fprintf(orte_xml_fp, "\n"); - } - } - - if (0 < orte_routed.num_routes()) { - orte_plm.terminate_orteds(); - } -#endif -} - -void orte_quit(void) -{ /* check one-time lock to protect against "bounce" */ if (opal_atomic_trylock(&orte_quit_lock)) { /* returns 1 if already locked */ return; } - /* whack any lingering session directory files from our jobs */ - orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - -#if !ORTE_DISABLE_FULL_SUPPORT - /* cleanup our data server */ - orte_data_server_finalize(); -#endif - - /* cleanup and leave */ - orte_finalize(); - -#if !ORTE_DISABLE_FULL_SUPPORT - if (NULL != orte_basename) { - free(orte_basename); + /* if we are the hnp and haven't already reported it, then + * report any errors + */ + if (ORTE_PROC_IS_HNP && !errors_reported) { + if (0 != orte_exit_status && !orte_execute_quiet) { + errors_reported = true; + /* abnormal termination of some kind */ + dump_aborted_procs(); + /* If we showed more abort messages than were allowed, + show a followup message here */ + if (num_failed_start > 1) { + if (orte_xml_output) { + fprintf(orte_xml_fp, ""); + } + fprintf(orte_xml_fp, "%d total process%s failed to start", + num_failed_start, ((num_failed_start > 1) ? "es" : "")); + if (orte_xml_output) { + fprintf(orte_xml_fp, " "); + } + fprintf(orte_xml_fp, "\n"); + } + if (num_aborted > 1) { + if (orte_xml_output) { + fprintf(orte_xml_fp, ""); + } + fprintf(orte_xml_fp, "%d total process%s aborted", + num_aborted, ((num_aborted > 1) ? 
"es" : "")); + if (orte_xml_output) { + fprintf(orte_xml_fp, " "); + } + fprintf(orte_xml_fp, "\n"); + } + if (num_killed > 1) { + if (orte_xml_output) { + fprintf(orte_xml_fp, ""); + } + fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)", + num_killed, ((num_killed > 1) ? "es" : ""), orte_basename); + if (orte_xml_output) { + fprintf(orte_xml_fp, " "); + } + fprintf(orte_xml_fp, "\n"); + } + } } - - if (orte_debug_flag) { - fprintf(stderr, "orterun: exiting with status %d\n", orte_exit_status); - } - exit(orte_exit_status); -#else - exit(0); -#endif + + /* flag that the event lib should no longer be looped + * so we will exit + */ + orte_event_base_active = false; } @@ -193,7 +167,6 @@ static void dump_aborted_procs(void) } if (ORTE_JOB_STATE_UNDEF != job->state && ORTE_JOB_STATE_INIT != job->state && - ORTE_JOB_STATE_LAUNCHED != job->state && ORTE_JOB_STATE_RUNNING != job->state && ORTE_JOB_STATE_TERMINATED != job->state && ORTE_JOB_STATE_ABORT_ORDERED != job->state) { @@ -207,7 +180,8 @@ static void dump_aborted_procs(void) /* array is left-justfied - we are done */ continue; } - if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state) { + if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state || + ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) { ++num_failed_start; } else if (ORTE_PROC_STATE_ABORTED == pptr->state) { ++num_aborted; @@ -224,7 +198,8 @@ static void dump_aborted_procs(void) approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx); node = proc->node; - if (ORTE_JOB_STATE_FAILED_TO_START == job->state) { + if (ORTE_JOB_STATE_FAILED_TO_START == job->state || + ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) { if (NULL == proc) { orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status-no-node", true, orte_basename); diff --git a/orte/runtime/orte_quit.h b/orte/runtime/orte_quit.h index 631d6665fa..9ed23c9995 100644 --- a/orte/runtime/orte_quit.h +++ b/orte/runtime/orte_quit.h @@ 
-1,5 +1,7 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -20,9 +22,7 @@ BEGIN_C_DECLS -ORTE_DECLSPEC void orte_jobs_complete(void); - -ORTE_DECLSPEC void orte_quit(void); +ORTE_DECLSPEC void orte_quit(int fd, short args, void *cbdata); END_C_DECLS diff --git a/orte/runtime/orte_wait.c b/orte/runtime/orte_wait.c index 4f3c46377e..cd68c7285a 100644 --- a/orte/runtime/orte_wait.c +++ b/orte/runtime/orte_wait.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008 Institut National de Recherche en Informatique * et Automatique. All rights reserved. @@ -65,57 +65,31 @@ #include "orte/runtime/orte_wait.h" +/********************************************************************* +* +* Timer Object Declaration +* +********************************************************************/ +static void timer_const(orte_timer_t *tm) +{ + tm->ev = opal_event_alloc(); + tm->payload = NULL; +} +static void timer_dest(orte_timer_t *tm) +{ + opal_event_free(tm->ev); + free(tm->ev); +} +OBJ_CLASS_INSTANCE(orte_timer_t, + opal_object_t, + timer_const, + timer_dest); /********************************************************************* * * Wait Object Declarations * ********************************************************************/ -static void message_event_destructor(orte_message_event_t *ev) -{ - if (NULL != ev->ev) { - free(ev->ev); - } - if (NULL != ev->buffer) { - OBJ_RELEASE(ev->buffer); - } -#if OPAL_ENABLE_DEBUG - if (NULL != ev->file) { - free(ev->file); - } -#endif -} - -static void message_event_constructor(orte_message_event_t *ev) -{ - ev->ev = (opal_event_t *) 
malloc(sizeof(opal_event_t)); - ev->buffer = OBJ_NEW(opal_buffer_t); -#if OPAL_ENABLE_DEBUG - ev->file = NULL; -#endif -} - -OBJ_CLASS_INSTANCE(orte_message_event_t, - opal_object_t, - message_event_constructor, - message_event_destructor); - -static void notify_event_destructor(orte_notify_event_t *ev) -{ - if (NULL != ev->ev) { - free(ev->ev); - } -} - -static void notify_event_constructor(orte_notify_event_t *ev) -{ - ev->ev = (opal_event_t *) malloc(sizeof(opal_event_t)); -} -OBJ_CLASS_INSTANCE(orte_notify_event_t, - opal_object_t, - notify_event_constructor, - notify_event_destructor); - #ifdef HAVE_WAITPID static volatile int cb_enabled = true; @@ -198,25 +172,6 @@ static OBJ_CLASS_INSTANCE(pending_pids_item_t, opal_list_item_t, NULL, NULL); static OBJ_CLASS_INSTANCE(registered_cb_item_t, opal_list_item_t, NULL, NULL); -static void -trigger_event_constructor(orte_trigger_event_t *trig) -{ - trig->name = NULL; - trig->channel = -1; - opal_atomic_init(&trig->lock, OPAL_ATOMIC_UNLOCKED); -} -static void -trigger_event_destructor(orte_trigger_event_t *trig) -{ - if (NULL != trig->name) { - free(trig->name); - } -} -OBJ_CLASS_INSTANCE(orte_trigger_event_t, - opal_object_t, - trigger_event_constructor, - trigger_event_destructor); - /********************************************************************* * * Local Variables @@ -266,10 +221,11 @@ orte_wait_init(void) OBJ_CONSTRUCT(&pending_pids, opal_list_t); OBJ_CONSTRUCT(®istered_cb, opal_list_t); - opal_event_set(opal_event_base, + opal_event_set(orte_event_base, &handler, SIGCHLD, OPAL_EV_SIGNAL|OPAL_EV_PERSIST, orte_wait_signal_callback, &handler); + opal_event_set_priority(&handler, ORTE_SYS_PRI); opal_event_add(&handler, NULL); return ORTE_SUCCESS; @@ -378,7 +334,7 @@ orte_waitpid(pid_t wpid, int *status, int options) #if OPAL_HAVE_POSIX_THREADS && ORTE_ENABLE_PROGRESS_THREADS if (opal_using_threads()) { opal_mutex_unlock(&mutex); - opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK); + 
opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK); opal_mutex_lock(&mutex); } #endif @@ -403,7 +359,7 @@ orte_waitpid(pid_t wpid, int *status, int options) for long. */ if (!OPAL_ENABLE_MULTI_THREADS) { - opal_event_loop(opal_event_base, OPAL_EVLOOP_NONBLOCK); + opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK); } } @@ -493,60 +449,6 @@ orte_wait_cb_enable() return ORTE_SUCCESS; } - -int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig, - char *trigger_name, - void (*cbfunc)(int, short, void*)) -{ - int p[2]; - - if (pipe(p) < 0) { - ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); - return ORTE_ERR_SYS_LIMITS_PIPES; - } - - /* save the trigger name */ - trig->name = strdup(trigger_name); - - /* create the event */ - *event = (opal_event_t *) malloc(sizeof(opal_event_t)); - - /* pass back the write end of the pipe */ - trig->channel = p[1]; - - /* define the event to fire when someone writes to the pipe */ - opal_event_set(opal_event_base, *event, p[0], OPAL_EV_READ, cbfunc, trig); - - /* Add it to the active events, without a timeout */ - opal_event_add(*event, NULL); - - /* all done */ - return ORTE_SUCCESS; -} - - -void orte_trigger_event(orte_trigger_event_t *trig) -{ - int data=1; - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s calling %s trigger", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - trig->name)); - - /* if we already fired it, don't do it again - this automatically - * records that we did fire it - */ - if (opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */ - return; - } - - write(trig->channel, &data, sizeof(int)); - close(trig->channel); - opal_progress(); -} - - /********************************************************************* * * Local Functions @@ -776,25 +678,6 @@ static void opal_process_handle_destruct( opal_object_t* obj ) static OBJ_CLASS_INSTANCE( opal_process_handle_t, opal_list_item_t, opal_process_handle_construct, opal_process_handle_destruct ); -static void 
-trigger_event_constructor(orte_trigger_event_t *trig) -{ - trig->name = NULL; - trig->channel = -1; - opal_atomic_init(&trig->lock, OPAL_ATOMIC_UNLOCKED); -} -static void -trigger_event_destructor(orte_trigger_event_t *trig) -{ - if (NULL != trig->name) { - free(trig->name); - } -} -OBJ_CLASS_INSTANCE(orte_trigger_event_t, - opal_object_t, - trigger_event_constructor, - trigger_event_destructor); - /********************************************************************* * * Interface Functions @@ -832,25 +715,6 @@ orte_wait_finalize(void) return ORTE_SUCCESS; } -void orte_trigger_event(orte_trigger_event_t *trig) -{ - int data=1; - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s calling %s trigger", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - trig->name)); - - if (opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */ - return; - } - - send(trig->channel, (const char*)&data, sizeof(int), 0); - closesocket(trig->channel); - opal_progress(); -} - - /** * Internal function which find a corresponding process structure * based on the pid. 
If create is true and the pid does not have a @@ -1044,38 +908,6 @@ orte_wait_cb_enable(void) } -int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig, - char *trigger_name, - void (*cbfunc)(int, short, void*)) -{ - int p[2]; - - if (create_socketpair(AF_UNIX, SOCK_STREAM, 0, p) == -1) { - return ORTE_ERROR; - } - - /* save the trigger name */ - trig->name = strdup(trigger_name); - - /* create the event */ - *event = (opal_event_t *) malloc(sizeof(opal_event_t)); - - /* setup the trigger and its associated lock */ - OBJ_CONSTRUCT(trig, orte_trigger_event_t); - - /* pass back the write end of the pipe */ - trig->channel = p[1]; - - /* define the event to fire when someone writes to the pipe */ - opal_event_set(opal_event_base, *event, p[0], OPAL_EV_READ, cbfunc, NULL); - - /* Add it to the active events, without a timeout */ - opal_event_add(*event, NULL); - - /* all done */ - return ORTE_SUCCESS; -} - int orte_wait_kill(int sig) @@ -1156,18 +988,6 @@ orte_wait_cb_enable(void) return ORTE_ERR_NOT_SUPPORTED; } -void orte_trigger_event(orte_trigger_event_t *trig) -{ -} - -int -orte_wait_event(opal_event_t **event, int *trig, - char *trigger_name, - void (*cbfunc)(int, short, void*)) -{ - return ORTE_ERR_NOT_SUPPORTED; -} - int orte_wait_kill(int sig) { diff --git a/orte/runtime/orte_wait.h b/orte/runtime/orte_wait.h index 5335461b1c..26bda27ad9 100644 --- a/orte/runtime/orte_wait.h +++ b/orte/runtime/orte_wait.h @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2008 Institut National de Recherche en Informatique * et Automatique. All rights reserved. + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -47,14 +49,6 @@ BEGIN_C_DECLS -typedef struct { - opal_object_t super; - char *name; - int channel; - opal_atomic_lock_t lock; -} orte_trigger_event_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_trigger_event_t); - /** typedef for callback function used in \c ompi_rte_wait_cb */ typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data); @@ -104,170 +98,14 @@ ORTE_DECLSPEC int orte_wait_cb_disable(void); ORTE_DECLSPEC int orte_wait_cb_enable(void); -/** - * Setup to wait for an event - * - * This function is used to setup a trigger event that can be used elsewhere - * in the code base where we want to wait for some event to - * happen. For example, orterun uses this function to setup an event - * that is used to notify orterun of abnormal and normal termination - * so it can wakeup and exit cleanly. - * - * The event will be defined so that firing the provided trigger - * will cause the event to trigger and callback to the provided - * function - */ -ORTE_DECLSPEC int orte_wait_event(opal_event_t **event, - orte_trigger_event_t *trig, - char *trigger_name, - void (*cbfunc)(int, short, void*)); - -/** - * In a number of places in the code, we need to wait for something - * to complete - for example, waiting for all launched procs to - * report into the HNP. In such cases, we want to just call progress - * so that any messages get processed, but otherwise "hold" the - * program at this spot until the counter achieves the specified - * value. We also want to provide a boolean flag, though, so that - * we break out of the loop should something go wrong. 
- */ -#define ORTE_PROGRESSED_WAIT(failed, counter, limit) \ - do { \ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ - "progressed_wait: %s %d", \ - __FILE__, __LINE__)); \ - while (!(failed) && (counter) < (limit)) { \ - opal_progress(); \ - } \ - } while(0); \ - - -/** - * Trigger a defined event - * - * This function will trigger a previously-defined event - as setup - * by orte_wait_event - by firing the provided trigger - */ -ORTE_DECLSPEC void orte_trigger_event(orte_trigger_event_t *trig); - -/** - * Setup an event to process a message - * - * If we are in an OOB recv callback, we frequently cannot process - * the received message until after we return from the callback to - * avoid a potential loopback situation - i.e., where processing - * the message can result in a message being sent somewhere that - * subsequently causes the recv we are in to get called again. - * This is typically the problem facing the daemons and HNP. - * - * To resolve this problem, we place the message to be processed on - * a list, and create a zero-time event that calls the function - * that will process the received message. The event library kindly - * does not trigger this event until after we return from the recv - * since the recv itself is considered an "event"! Thus, we will - * always execute the specified event cb function -after- leaving - * the recv. 
- */ +/* define an object for timer events */ typedef struct { opal_object_t super; + struct timeval tv; opal_event_t *ev; - orte_process_name_t sender; - opal_buffer_t *buffer; - orte_rml_tag_t tag; -#if OPAL_ENABLE_DEBUG - char *file; - int line; -#endif -} orte_message_event_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t); - -#define ORTE_MESSAGE_EVENT_DELAY(delay, mev) \ - do { \ - struct timeval now; \ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ - "defining message event delay: %s %d", \ - __FILE__, __LINE__)); \ - now.tv_sec = delay/1000000; \ - now.tv_usec = delay%1000000; \ - opal_event_evtimer_add(mev->ev, &now); \ - } while(0); - -#if OPAL_ENABLE_DEBUG - -#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc) \ - do { \ - orte_message_event_t *mev; \ - struct timeval now; \ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ - "defining message event: %s %d", \ - __FILE__, __LINE__)); \ - mev = OBJ_NEW(orte_message_event_t); \ - mev->sender.jobid = (sndr)->jobid; \ - mev->sender.vpid = (sndr)->vpid; \ - ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ - opal_dss.copy_payload(mev->buffer, (buf)); \ - mev->tag = (tg); \ - mev->file = strdup((buf)->parent.cls_init_file_name); \ - mev->line = (buf)->parent.cls_init_lineno; \ - opal_event_evtimer_set(opal_event_base, \ - mev->ev, (cbfunc), mev); \ - now.tv_sec = 0; \ - now.tv_usec = 0; \ - opal_event_evtimer_add(mev->ev, &now); \ - } while(0); - -#else - -#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc) \ - do { \ - orte_message_event_t *mev; \ - struct timeval now; \ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ - "defining message event: %s %d", \ - __FILE__, __LINE__)); \ - mev = OBJ_NEW(orte_message_event_t); \ - mev->sender.jobid = (sndr)->jobid; \ - mev->sender.vpid = (sndr)->vpid; \ - ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \ - opal_dss.copy_payload(mev->buffer, (buf)); \ - mev->tag = (tg); \ - opal_event_evtimer_set(opal_event_base, \ - mev->ev, (cbfunc), mev); \ - now.tv_sec = 0; \ 
- now.tv_usec = 0; \ - opal_event_evtimer_add(mev->ev, &now); \ - } while(0); - -#endif - -/* Sometimes, we just need to get out of the event library so - * we can progress - and we need to pass a little info. For those - * cases, we define a zero-time event that passes info to a cbfunc - */ -typedef struct { - opal_object_t super; - opal_event_t *ev; - orte_process_name_t proc; -} orte_notify_event_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_notify_event_t); - -#define ORTE_NOTIFY_EVENT(cbfunc, data) \ - do { \ - struct timeval now; \ - orte_notify_event_t *tmp; \ - tmp = OBJ_NEW(orte_notify_event_t); \ - tmp->proc.jobid = (data)->jobid; \ - tmp->proc.vpid = (data)->vpid; \ - ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch); \ - opal_event.evtimer_set(opal_event_base, \ - tmp->ev, (cbfunc), tmp); \ - now.tv_sec = 0; \ - now.tv_usec = 0; \ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ - "defining notify event at %s:%d", \ - __FILE__, __LINE__)); \ - opal_event_evtimer_add(tmp->ev, &now); \ - } while(0); \ + void *payload; +} orte_timer_t; +OBJ_CLASS_DECLARATION(orte_timer_t); /** * In a number of places within the code, we want to setup a timer @@ -284,27 +122,30 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_notify_event_t); * less than 1M since some systems care about that, and to ensure * that the computed wait time doesn't exceed the desired max * wait + * + * NOTE: the callback function is responsible for releasing the timer + * event back to the event pool! 
*/ -#define ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc) \ +#define ORTE_DETECT_TIMEOUT(n, deltat, maxwait, cbfunc, cbd) \ do { \ - struct timeval now; \ - opal_event_t *tmp; \ + orte_timer_t *tmp; \ int timeout; \ - tmp = (opal_event_t *) malloc(sizeof(opal_event_t)); \ - opal_event_evtimer_set(opal_event_base, \ - tmp, (cbfunc), tmp); \ + tmp = OBJ_NEW(orte_timer_t); \ + tmp->payload = (cbd); \ + opal_event_evtimer_set(orte_event_base, \ + tmp->ev, (cbfunc), tmp); \ + opal_event_set_priority(tmp->ev, ORTE_ERROR_PRI); \ timeout = (deltat) * (n); \ if ((maxwait) > 0 && timeout > (maxwait)) { \ timeout = (maxwait); \ } \ - now.tv_sec = timeout/1000000; \ - now.tv_usec = timeout%1000000; \ + tmp->tv.tv_sec = timeout/1000000; \ + tmp->tv.tv_usec = timeout%1000000; \ OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ "defining timeout: %ld sec %ld usec at %s:%d", \ - (long)now.tv_sec, (long)now.tv_usec, \ + (long)tmp->tv.tv_sec, (long)tmp->tv.tv_usec, \ __FILE__, __LINE__)); \ - opal_event_evtimer_add(tmp, &now); \ - *(event) = tmp; \ + opal_event_evtimer_add(tmp->ev, &tmp->tv); \ }while(0); \ @@ -312,21 +153,25 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_notify_event_t); * There are places in the code where we just want to periodically * wakeup to do something, and then go back to sleep again. Setting * a timer allows us to do this + * + * NOTE: the callback function is responsible for releasing the timer + * event back to the event pool when done! Otherwise, the finalize + * function will take care of it. 
*/ -#define ORTE_TIMER_EVENT(sec, usec, cbfunc) \ +#define ORTE_TIMER_EVENT(sec, usec, cbfunc, pri) \ do { \ - struct timeval now; \ - opal_event_t *tmp; \ - tmp = (opal_event_t *) malloc(sizeof(opal_event_t)); \ - opal_event_evtimer_set(opal_event_base, \ - tmp, (cbfunc), tmp); \ - now.tv_sec = (sec); \ - now.tv_usec = (usec); \ + orte_timer_t *tm; \ + tm = OBJ_NEW(orte_timer_t); \ + opal_event_evtimer_set(orte_event_base, \ + tm->ev, (cbfunc), tm); \ + opal_event_set_priority(tm->ev, (pri)); \ + tm->tv.tv_sec = (sec) + (usec)/1000000; \ + tm->tv.tv_usec = (usec) % 1000000; \ OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \ - "defining timer event: %ld sec %ld usec at %s:%d", \ - (long)now.tv_sec, (long)now.tv_usec, \ - __FILE__, __LINE__)); \ - opal_event_evtimer_add(tmp, &now); \ + "defining timer event: %ld sec %ld usec at %s:%d", \ + (long)tm->tv.tv_sec, (long)tm->tv.tv_usec, \ + __FILE__, __LINE__)); \ + opal_event_evtimer_add(tm->ev, &tm->tv); \ }while(0); \ diff --git a/orte/test/mpi/hello.c b/orte/test/mpi/hello.c index c73f4a56ea..48e3c8ba3c 100644 --- a/orte/test/mpi/hello.c +++ b/orte/test/mpi/hello.c @@ -24,7 +24,7 @@ int main(int argc, char* argv[]) rc = hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS); hwloc_bitmap_list_asprintf(&bindings, cpus); - printf("Hello, World, I am %d of %d: rc %d bitmap %s\n", rank, size, rc, + printf("Hello, World, I am %d of %d: get_cpubind: %d bitmap %s\n", rank, size, rc, (NULL == bindings) ? 
"NULL" : bindings); MPI_Finalize(); diff --git a/orte/test/system/Makefile b/orte/test/system/Makefile index 5abca14727..ecd7af09b9 100644 --- a/orte/test/system/Makefile +++ b/orte/test/system/Makefile @@ -1,4 +1,4 @@ -PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo orte_exit orte_db orte_sensor test-time event-threads psm_keygen regex orte_errors +PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits orte_ring spawn_child orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix orte_barrier orte_mcast opal_interface mcast mcast_recv orte_spin segfault sysinfo orte_exit orte_db orte_sensor test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test all: $(PROGS) diff --git a/orte/test/system/event-threads.c b/orte/test/system/event-threads.c index 1cd61b82df..f09502ef15 100644 --- a/orte/test/system/event-threads.c +++ b/orte/test/system/event-threads.c @@ -15,7 +15,7 @@ #include "opal/runtime/opal.h" #include "opal/mca/event/event.h" -static opal_event_base_t *my_base=NULL; +static orte_event_base_t *my_base=NULL; static opal_thread_t progress_thread; static bool progress_thread_stop=false; static int progress_thread_pipe[2]; @@ -42,7 +42,7 @@ int main(int argc, char **argv) opal_event_use_threads(); /* create a new base */ - my_base = opal_event_base_create(); + my_base = orte_event_base_create(); /* launch a progress thread on that base*/ pipe(progress_thread_pipe); @@ -52,7 +52,7 @@ int main(int argc, char **argv) progress_thread.t_run = progress_engine; if (OPAL_SUCCESS != opal_thread_start(&progress_thread)) { fprintf(stderr, "Unable to start progress thread\n"); - opal_event_base_finalize(my_base); + orte_event_base_finalize(my_base); exit(1); } 
diff --git a/orte/test/system/evpri-test.c b/orte/test/system/evpri-test.c new file mode 100644 index 0000000000..cc222d6e52 --- /dev/null +++ b/orte/test/system/evpri-test.c @@ -0,0 +1,108 @@ + +#include +#include + +#include +#include +#include +#include + +#define SIGPRI 0 +#define TERMPRI 1 + +static struct event_base *base; +static bool run=true; +static int loops=0; + +static void +cbfunc(evutil_socket_t fd, short what, void *arg) +{ + fprintf(stderr, "CAUGHT SIGNAL\n"); + fflush(stderr); +#if 0 + event_base_loopbreak(base); +#endif + run = false; +} + +static void +die(const char *msg) +{ + fprintf(stderr, "%s\n", msg); + fflush(stderr); + exit(1); +} + +static void +t1func(evutil_socket_t fd, short what, void *arg) +{ + struct event *t1 = (struct event*)arg; + struct event *t2; + + fprintf(stderr, "CAUGHT EVENT\n"); + fflush(stderr); + event_del(t1); + free(t1); + loops++; + if (loops < 10) { + t2 = (struct event*)malloc(sizeof(struct event)); + if (event_assign(t2, base, -1, EV_WRITE, t1func, t2) < 0) { + die("event_assign_term"); + } + if (event_priority_set(t2, TERMPRI) < 0) { + die("event_priority_set_term"); + } + fprintf(stderr, "EVENT %d DEFINED\n", loops); + fflush(stderr); + event_active(t2, EV_WRITE, 1); + fprintf(stderr, "EVENT %d ACTIVATED\n", loops); + fflush(stderr); + } +} + +int +main(int argc, char **argv) +{ + struct event ev; + struct event *t1; + + event_enable_debug_mode(); + + fprintf(stderr, "Libevent %s\n", event_get_version()); + fflush(stderr); + + if (!(base = event_base_new())) + die("event_base_new"); + if (event_base_priority_init(base, 8) < 0) + die("event_base_priority_init"); + if (event_assign(&ev, base, SIGTERM, EV_SIGNAL|EV_PERSIST, cbfunc, NULL)<0) + die("event_assign"); + if (event_priority_set(&ev, SIGPRI) < 0) + die("event_priority_set"); + if (event_add(&ev, NULL) < 0) + die("event_add"); + fprintf(stderr, "SIGNAL EVENT DEFINED\n"); + fflush(stderr); + + t1 = (struct event*)malloc(sizeof(struct event)); + if 
(event_assign(t1, base, -1, EV_WRITE, t1func, t1) < 0) { + die("event_assign_term"); + } + if (event_priority_set(t1, TERMPRI) < 0) { + die("event_priority_set_term"); + } + event_active(t1, EV_WRITE, 1); + fprintf(stderr, "FIRST TERMINATION EVENT DEFINED\n"); + fflush(stderr); + + /* event_dispatch(base); */ + + while (run) { + event_base_loop(base, EVLOOP_ONCE); + } + + fprintf(stderr, "EXITED LOOP - FREEING BASE\n"); + fflush(stderr); + event_base_free(base); + return 0; +} diff --git a/orte/test/system/evthread-test.c b/orte/test/system/evthread-test.c index a8b07f4649..58c464679d 100644 --- a/orte/test/system/evthread-test.c +++ b/orte/test/system/evthread-test.c @@ -11,23 +11,27 @@ #endif #include -#include +#include "opal/threads/threads.h" +#include "opal/runtime/opal.h" +#include "opal/mca/event/event.h" -#include -#include -#include -#include - -static struct event_base *my_base=NULL; -static pthread_t progress_thread; +static orte_event_base_t *my_base=NULL; +static opal_thread_t progress_thread; static bool progress_thread_stop=false; static int progress_thread_pipe[2]; -static pthread_mutex_t lock; -static struct event write_event; -static int my_fd; +static opal_mutex_t lock; +static opal_condition_t cond; +static bool active=false; +typedef struct { + opal_object_t super; + opal_event_t write_event; +} foo_caddy_t; +OBJ_CLASS_INSTANCE(foo_caddy_t, + opal_object_t, + NULL, NULL); static bool fd_written=false; -static void* progress_engine(void *obj); +static void* progress_engine(opal_object_t *obj); static void send_handler(int sd, short flags, void *arg); int main(int argc, char **argv) @@ -35,29 +39,28 @@ int main(int argc, char **argv) char byte='a'; struct timespec tp={0, 100}; int count=0; + foo_caddy_t *foo; + + /* Initialize the event library */ + opal_init(&argc, &argv); /* setup for threads */ - evthread_use_pthreads(); + opal_event_use_threads(); /* create a new base */ - my_base = event_base_new(); + my_base = orte_event_base_create(); - 
/* launch a progress thread on that base*/ + /* launch a progress thread on that base*/ pipe(progress_thread_pipe); - - if (pthread_mutex_init(&lock, NULL)) { - fprintf(stderr, "pthread_mutex_init failed\n"); - exit(1); + OBJ_CONSTRUCT(&lock, opal_mutex_t); + OBJ_CONSTRUCT(&cond, opal_condition_t); + OBJ_CONSTRUCT(&progress_thread, opal_thread_t); + progress_thread.t_run = progress_engine; + if (OPAL_SUCCESS != opal_thread_start(&progress_thread)) { + fprintf(stderr, "Unable to start progress thread\n"); + orte_event_base_finalize(my_base); + exit(1); } - if (pthread_create(&progress_thread, NULL, progress_engine, - NULL)) { - fprintf(stderr, "pthread_create failed\n"); - exit(1); - } - /* - pthread starts the thread running itself; no need to do anything to - launch it. - */ /* wait a little while - reflects reality in an async system */ while (count < 100) { @@ -66,39 +69,17 @@ int main(int argc, char **argv) } count=0; -#ifdef WAKE_WITH_EVENT /* make a dummy event */ fprintf(stderr, "activating the write_event"); - event_assign(&write_event, - my_base, - -1, - 0, - send_handler, - NULL); + foo = OBJ_NEW(foo_caddy_t); + opal_event_set(my_base, + &foo->write_event, + -1, + 0, + send_handler, + foo); /* activate it. */ - event_active(&write_event, EV_WRITE, 1); -#else - fprintf(stderr, "opening the file"); - /* define a file descriptor event - looks like an incoming socket - * connection being created, if we're lucky. 
- */ - my_fd = open("foo", O_CREAT | O_TRUNC | O_RDWR, 0644); - if (my_fd <0) { - perror("open"); - exit(1); - } - event_assign(&write_event, - my_base, - my_fd, - EV_WRITE|EV_PERSIST, - send_handler, - NULL); - event_add(&write_event, NULL); - if (write(progress_thread_pipe[1], &byte, 1) < 0) { - perror("write"); - exit(1); - } -#endif + opal_event_active(&foo->write_event, EV_WRITE, 1); /* wait for it to trigger */ while (!fd_written && count < 1000) { @@ -110,13 +91,16 @@ int main(int argc, char **argv) } /* stop the thread */ - pthread_mutex_lock(&lock); + OPAL_ACQUIRE_THREAD(&lock, &cond, &active); progress_thread_stop = true; - pthread_mutex_unlock(&lock); - - write(progress_thread_pipe[1], &byte, 1); - pthread_join(progress_thread, NULL); + OPAL_RELEASE_THREAD(&lock, &cond, &active); + opal_fd_write(progress_thread_pipe[1], 1, &byte); + opal_thread_join(&progress_thread, NULL); + /* release the base */ + fprintf(stderr, "Cleaning up\n"); + opal_finalize(); + fprintf(stderr, "Cleanup completed\n"); return 0; } @@ -124,56 +108,47 @@ static struct event stop_event; static void stop_handler(int sd, short flags, void* cbdata) { char byte; - int n; - if ((n = read(progress_thread_pipe[0], &byte, 1)) <= 0) { - if (n == 0) - fprintf(stderr, "got a close\n"); - else - perror("read"); - } + opal_fd_read(progress_thread_pipe[0], 1, &byte); + fprintf(stderr, "Stop handler called\n"); /* reset the event */ - event_add(&stop_event, NULL); + opal_event_add(&stop_event, 0); return; } -static void* progress_engine(void *obj) +static void* progress_engine(opal_object_t *obj) { /* define an event that will be used to kick us out of a blocking * situation when we want to exit */ - event_assign(&stop_event, my_base, - progress_thread_pipe[0], EV_READ, stop_handler, NULL); - event_add(&stop_event, NULL); + /* define an event that will be used to kick us out of a blocking + * situation when we want to exit + */ + opal_event_set(my_base, &stop_event, + progress_thread_pipe[0], 
OPAL_EV_READ, stop_handler, NULL); + opal_event_add(&stop_event, 0); while (1) { - pthread_mutex_lock(&lock); + OPAL_ACQUIRE_THREAD(&lock, &cond, &active); if (progress_thread_stop) { fprintf(stderr, "Thread stopping\n"); - pthread_mutex_unlock(&lock); /* moved this */ - event_del(&stop_event); - return (void*)1; + OPAL_RELEASE_THREAD(&lock, &cond, &active); + opal_event_del(&stop_event); + return OPAL_THREAD_CANCELLED; } - pthread_mutex_unlock(&lock); + OPAL_RELEASE_THREAD(&lock, &cond, &active); fprintf(stderr, "Looping...\n"); - event_base_loop(my_base, EVLOOP_ONCE); + opal_event_loop(my_base, OPAL_EVLOOP_ONCE); } } static void send_handler(int sd, short flags, void *arg) { -#ifdef WAKE_WITH_EVENT + foo_caddy_t *foo = (foo_caddy_t*)arg; + fprintf(stderr, "Deleting event\n"); + opal_event_del(&foo->write_event); + OBJ_RELEASE(foo); fprintf(stderr, "Write event fired\n"); -#else - char *bytes="This is an output string\n"; - - fprintf(stderr, "Write event fired\n"); - if (write(my_fd, bytes, strlen(bytes)) < 0) { - perror("write"); - exit(1); - } - event_del(&write_event); -#endif fd_written = true; /* This needs a lock around it if you are reading it * in the main thread and changing it here XXX */ } diff --git a/orte/test/system/oob_stress.c b/orte/test/system/oob_stress.c index 853d393bcc..f3e08d0203 100644 --- a/orte/test/system/oob_stress.c +++ b/orte/test/system/oob_stress.c @@ -74,7 +74,6 @@ main(int argc, char *argv[]){ for (j=1; j < count+1; j++) { peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs; - ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer)); /* rank0 starts ring */ if (ORTE_PROC_MY_NAME->vpid == 0) { diff --git a/orte/test/system/opal-evpri-test.c b/orte/test/system/opal-evpri-test.c new file mode 100644 index 0000000000..70fd2313de --- /dev/null +++ b/orte/test/system/opal-evpri-test.c @@ -0,0 +1,107 @@ +#include +#include +#include +#include + +#include "opal/mca/event/event.h" + +#include 
"orte/mca/state/state_types.h" + +#define SIGPRI 0 +#define TERMPRI 1 + +static bool run=true; +static int loops=0; + +static void +cbfunc(evutil_socket_t fd, short what, void *arg) +{ + fprintf(stderr, "CAUGHT SIGNAL\n"); + fflush(stderr); +#if 0 + event_base_loopbreak(base); +#endif + run = false; +} + +static void +die(const char *msg) +{ + fprintf(stderr, "%s\n", msg); + fflush(stderr); + exit(1); +} + +static void +t1func(evutil_socket_t fd, short what, void *arg) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)arg; + orte_state_caddy_t *c2; + + fprintf(stderr, "CAUGHT EVENT\n"); + fflush(stderr); + loops++; + if (loops < 10) { + c2 = OBJ_NEW(orte_state_caddy_t); + opal_event_set(orte_event_base, &c2->ev, -1, OPAL_EV_READ, t1func, c2); + opal_event_set_priority(&c2->ev, ORTE_SYS_PRI); + + fprintf(stderr, "EVENT %d DEFINED\n", loops); + fflush(stderr); + opal_event_active(&c2->ev, OPAL_EV_WRITE, 1); + fprintf(stderr, "EVENT %d ACTIVATED\n", loops); + fflush(stderr); + } + + OBJ_RELEASE(caddy); +} + +int +main(int argc, char **argv) +{ + opal_event_t ev1, ev2; + orte_state_caddy_t *caddy; + + opal_init(); + + /* assign some signal traps */ + if (opal_event_signal_set(orte_event_base, &ev1, SIGTERM, cbfunc, &ev1) < 0) { + die("event_assign"); + } + if (opal_event_set_priority(&ev1, ORTE_ERROR_PRI) < 0) { + die("event_set_pri"); + } + if (opal_event_signal_add(&ev1, NULL) < 0) { + die("event_add"); + } + if (opal_event_signal_set(orte_event_base, &ev2, SIGPIPE, cbfunc, &ev2) < 0) { + die("event_assign"); + } + if (opal_event_set_priority(&ev2, ORTE_ERROR_PRI) < 0) { + die("event_assign"); + } + if (opal_event_signal_add(&ev2, NULL) < 0) { + die("event_assign"); + } + fprintf(stderr, "SIGNAL EVENTS DEFINED\n"); + fflush(stderr); + + /* assign a state event */ + caddy = OBJ_NEW(orte_state_caddy_t); + opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_READ, t1func, caddy); + opal_event_set_priority(&caddy->ev, ORTE_SYS_PRI); + opal_event_active(&caddy->ev, 
OPAL_EV_WRITE, 1); + fprintf(stderr, "FIRST EVENT DEFINED AND ACTIVATED\n"); + fflush(stderr); + + /* event_dispatch(base); */ + + while (run) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + + fprintf(stderr, "EXITED LOOP - FINALIZING\n"); + fflush(stderr); + opal_finalize(); + return 0; +} diff --git a/orte/test/system/orte_mcast.c b/orte/test/system/orte_mcast.c index 0daac73e78..928ee94136 100644 --- a/orte/test/system/orte_mcast.c +++ b/orte/test/system/orte_mcast.c @@ -136,7 +136,7 @@ int main(int argc, char* argv[]) } orte_grpcomm.barrier(); /* ensure the public recv is ready */ } - opal_event_dispatch(opal_event_base); + opal_event_dispatch(orte_event_base); blast: orte_finalize(); diff --git a/orte/test/system/orte_ring.c b/orte/test/system/orte_ring.c index 3552283025..081ab0c69d 100644 --- a/orte/test/system/orte_ring.c +++ b/orte/test/system/orte_ring.c @@ -41,14 +41,12 @@ main(int argc, char *argv[]){ if( right_peer_orte_name.vpid >= num_peers ) { right_peer_orte_name.vpid = 0; } - ORTE_EPOCH_SET(right_peer_orte_name.epoch,orte_ess.proc_get_epoch(&right_peer_orte_name)); left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid; left_peer_orte_name.vpid = ORTE_PROC_MY_NAME->vpid - 1; if( ORTE_PROC_MY_NAME->vpid == 0 ) { left_peer_orte_name.vpid = num_peers - 1; } - ORTE_EPOCH_SET(left_peer_orte_name.epoch,orte_ess.proc_get_epoch(&left_peer_orte_name)); printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), getpid(), diff --git a/orte/test/system/orte_sensor.c b/orte/test/system/orte_sensor.c index 3256518e1d..7129c041c3 100644 --- a/orte/test/system/orte_sensor.c +++ b/orte/test/system/orte_sensor.c @@ -40,7 +40,7 @@ int main(int argc, char* argv[]) orte_sensor.start(ORTE_JOBID_INVALID); /* just sit here, letting the sensors run */ - opal_event_dispatch(opal_event_base); + opal_event_dispatch(orte_event_base); orte_finalize(); return 0; diff --git a/orte/test/system/orte_spawn.c 
b/orte/test/system/orte_spawn.c index 391305bdc2..4b3ea48986 100644 --- a/orte/test/system/orte_spawn.c +++ b/orte/test/system/orte_spawn.c @@ -74,8 +74,6 @@ int main(int argc, char* argv[]) for (i=0; i < app->num_procs; i++) { name.vpid = i; - ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name)); - fprintf(stderr, "Parent: sending message to child %s\n", ORTE_NAME_PRINT(&name)); if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) { ORTE_ERROR_LOG(rc); diff --git a/orte/test/system/orte_spin.c b/orte/test/system/orte_spin.c index 169fe1b303..168ce8b118 100644 --- a/orte/test/system/orte_spin.c +++ b/orte/test/system/orte_spin.c @@ -22,7 +22,7 @@ int main(int argc, char* argv[]) } opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_event_dispatch(opal_event_base); + opal_event_dispatch(orte_event_base); orte_finalize(); diff --git a/orte/test/system/test-time.c b/orte/test/system/test-time.c index 561910d068..7fc8b6f5ff 100644 --- a/orte/test/system/test-time.c +++ b/orte/test/system/test-time.c @@ -70,13 +70,13 @@ int main(int argc, char **argv) for (i = 0; i < NEVENT; i++) { /* Initalize one event */ ev[i] = (opal_event_t*)malloc(sizeof(opal_event_t)); - opal_event_evtimer_set(opal_event_base, ev[i], time_cb, ev[i]); + opal_event_evtimer_set(orte_event_base, ev[i], time_cb, ev[i]); tv.tv_sec = 0; tv.tv_usec = rand_int(50000); opal_event_evtimer_add(ev[i], &tv); } - opal_event_dispatch(opal_event_base); + opal_event_dispatch(orte_event_base); opal_finalize(); return (called < NEVENT); diff --git a/orte/threads/Makefile.am b/orte/threads/Makefile.am deleted file mode 100644 index bb05503ce7..0000000000 --- a/orte/threads/Makefile.am +++ /dev/null @@ -1,20 +0,0 @@ -# -*- makefile -*- -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# This makefile.am does not stand on its own - it is included from orte/Makefile.am - -# Source code files -headers += \ - threads/condition.h \ - threads/mutex.h \ - threads/threads.h - -libopen_rte_la_SOURCES += \ - threads/thread.c diff --git a/orte/threads/condition.h b/orte/threads/condition.h deleted file mode 100644 index f74073e321..0000000000 --- a/orte/threads/condition.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#ifndef ORTE_CONDITION_H -#define ORTE_CONDITION_H - -#include "orte_config.h" -#ifdef HAVE_SYS_TIME_H -#include -#endif -#ifdef HAVE_TIME_H -#include -#endif -#if OPAL_HAVE_POSIX_THREADS -#include -#elif OPAL_HAVE_SOLARIS_THREADS -#include -#include -#endif - -#include "opal/threads/threads.h" -#include "opal/runtime/opal_cr.h" - -#include "orte/runtime/orte_globals.h" - -BEGIN_C_DECLS - -static inline int orte_condition_wait(opal_condition_t *c, opal_mutex_t *m) -{ - int rc = 0; - c->c_waiting++; - -/* -#if OPAL_HAVE_POSIX_THREADS - if (orte_progress_threads_enabled) { - rc = pthread_cond_wait(&c->c_cond, &m->m_lock_pthread); - } else { -#endif -*/ - if (c->c_signaled) { - c->c_waiting--; - opal_mutex_unlock(m); - opal_progress(); - OPAL_CR_TEST_CHECKPOINT_READY_STALL(); - opal_mutex_lock(m); - return 0; - } - while (c->c_signaled == 0) { - opal_mutex_unlock(m); - opal_progress(); - OPAL_CR_TEST_CHECKPOINT_READY_STALL(); - opal_mutex_lock(m); - } -/* -#if OPAL_HAVE_POSIX_THREADS - } -#endif - */ - - c->c_signaled--; - c->c_waiting--; - return rc; -} - -#if OPAL_ENABLE_DEBUG -#define ORTE_CONDITION_WAIT(x, y) \ - do { \ - if (opal_debug_threads) { \ - opal_output(0, "Entering condition wait for %s at %s:%d", \ - (NULL == (x)->name) ? 
"NULL" : (x)->name, \ - __FILE__, __LINE__); \ - } \ - orte_condition_wait((x), (y)); \ - } while (0); -#else -#define ORTE_CONDITION_WAIT(x, y) orte_condition_wait(x, y) -#endif - -static inline int orte_condition_timedwait(opal_condition_t *c, - opal_mutex_t *m, - const struct timespec *abstime) -{ - int rc = 0; - struct timeval tv; - struct timeval absolute; - - c->c_waiting++; - -/* -#if OPAL_HAVE_POSIX_THREADS - if (orte_progress_threads_enabled) { - rc = pthread_cond_timedwait(&c->c_cond, &m->m_lock_pthread, abstime); - } else { -#endif -*/ - absolute.tv_sec = abstime->tv_sec; - absolute.tv_usec = abstime->tv_nsec * 1000; - gettimeofday(&tv,NULL); - if (c->c_signaled == 0) { - do { - opal_mutex_unlock(m); - opal_progress(); - gettimeofday(&tv,NULL); - opal_mutex_lock(m); - } while (c->c_signaled == 0 && - (tv.tv_sec <= absolute.tv_sec || - (tv.tv_sec == absolute.tv_sec && tv.tv_usec < absolute.tv_usec))); - } -/* -#if OPAL_HAVE_POSIX_THREADS - } -#endif -*/ - if (c->c_signaled != 0) c->c_signaled--; - c->c_waiting--; - return rc; -} - -static inline int orte_condition_signal(opal_condition_t *c) -{ - if (c->c_waiting) { - c->c_signaled++; -/* -#if OPAL_HAVE_POSIX_THREADS - if (orte_progress_threads_enabled) { - pthread_cond_signal(&c->c_cond); - } -#endif -*/ - } - return 0; -} - -#if OPAL_ENABLE_DEBUG -#define ORTE_CONDITION_SIGNAL(x) \ - do { \ - if (opal_debug_threads) { \ - opal_output(0, "Signaling condition %s at %s:%d", \ - (NULL == (x)->name) ? 
"NULL" : (x)->name, \ - __FILE__, __LINE__); \ - } \ - orte_condition_signal((x)); \ - } while(0); -#else -#define ORTE_CONDITION_SIGNAL(x) orte_condition_signal(x) -#endif - -static inline int orte_condition_broadcast(opal_condition_t *c) -{ - c->c_signaled = c->c_waiting; - /* -#if OPAL_HAVE_POSIX_THREADS - if (orte_progress_threads_enabled) { - if( 1 == c->c_waiting ) { - pthread_cond_signal(&c->c_cond); - } else { - pthread_cond_broadcast(&c->c_cond); - } - } -#endif - */ - return 0; -} - -#if OPAL_ENABLE_DEBUG -#define ORTE_CONDITION_BROADCAST(x) \ - do { \ - if (opal_debug_threads) { \ - opal_output(0, "Broadcasting condition %s at %s:%d", \ - (NULL == (x)->name) ? "NULL" : (x)->name, \ - __FILE__, __LINE__); \ - } \ - orte_condition_broadcast((x)); \ - } while(0); -#else -#define ORTE_CONDITION_BROADCAST(x) orte_condition_broadcast(x) -#endif - -END_C_DECLS - -#endif - diff --git a/orte/threads/mutex.h b/orte/threads/mutex.h deleted file mode 100644 index 3b5e7ce7d1..0000000000 --- a/orte/threads/mutex.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_MUTEX_H -#define ORTE_MUTEX_H - -#include "orte_config.h" - -#include "opal/sys/atomic.h" -#include "opal/threads/mutex.h" -#if OPAL_ENABLE_DEBUG -#include "opal/util/output.h" -#endif - -BEGIN_C_DECLS - -/* Lock a mutex */ -#define ORTE_THREAD_LOCK(mutex) opal_mutex_lock(mutex) - -/** - * Try to lock a mutex - * Returns 0 if mutex was locked, non-zero otherwise. 
- */ -#define ORTE_THREAD_TRYLOCK(mutex) opal_mutex_trylock(mutex) - -/** Unlock a mutex */ -#define ORTE_THREAD_UNLOCK(mutex) opal_mutex_unlock(mutex) - - -/* Lock a mutex */ -#define ORTE_THREAD_SCOPED_LOCK(mutex, action) \ - do { \ - opal_mutex_lock(mutex); \ - (action); \ - opal_mutex_unlock(mutex); \ - } while (0) - -/* Use an atomic operation for increment/decrement */ - -#define ORTE_THREAD_ADD32(x,y) opal_atomic_add_32(x,y) - -#define ORTE_THREAD_ADD64(x,y) opal_atomic_add_64(x,y) - -#define ORTE_THREAD_ADD_SIZE_T(x,y) opal_atomic_add_size_t(x,y) - -#define ORTE_CMPSET(x, y, z) ((*(x) == (y)) ? ((*(x) = (z)), 1) : 0) - -#if OPAL_HAVE_ATOMIC_CMPSET_32 -#define ORTE_ATOMIC_CMPSET_32(x, y, z) opal_atomic_cmpset_32(x, y, z) -# endif - -# if OPAL_HAVE_ATOMIC_CMPSET_64 -#define ORTE_ATOMIC_CMPSET_64(x, y, z) opal_atomic_cmpset_64(x, y, z) -#endif - -#if OPAL_HAVE_ATOMIC_CMPSET_32 || OPAL_HAVE_ATOMIC_CMPSET_64 -#define ORTE_ATOMIC_CMPSET(x, y, z) opal_atomic_cmpset(x, y, z) -#endif - -END_C_DECLS - -#endif /* ORTE_MUTEX_H */ diff --git a/orte/threads/thread.c b/orte/threads/thread.c deleted file mode 100644 index 239bbb9c97..0000000000 --- a/orte/threads/thread.c +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include "opal/mca/event/event.h" - -#include "orte/threads/threads.h" - -static void constructor(orte_thread_ctl_t *ptr) -{ - OBJ_CONSTRUCT(&ptr->lock, opal_mutex_t); - OBJ_CONSTRUCT(&ptr->cond, opal_condition_t); - ptr->active = false; - ptr->running = false; - ptr->stop = false; - ptr->name = NULL; - /* default to updating the global base */ - ptr->evbase = opal_event_base; -} -static void destructor(orte_thread_ctl_t *ptr) -{ - OBJ_DESTRUCT(&ptr->lock); - OBJ_DESTRUCT(&ptr->cond); - if (NULL != ptr->name) { - free(ptr->name); - } -} -OBJ_CLASS_INSTANCE(orte_thread_ctl_t, - opal_object_t, - constructor, destructor); diff --git a/orte/threads/threads.h b/orte/threads/threads.h deleted file mode 100644 index a8ed1db437..0000000000 --- a/orte/threads/threads.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_THREAD_H -#define ORTE_THREAD_H - -#include "orte_config.h" - -#include "opal/class/opal_object.h" -#if OPAL_ENABLE_DEBUG -#include "opal/util/output.h" -#endif -#include "opal/util/fd.h" -#include "opal/mca/event/event.h" - -#include "mutex.h" -#include "condition.h" - -BEGIN_C_DECLS - -typedef struct { - opal_object_t super; - opal_mutex_t lock; - opal_condition_t cond; - volatile bool active; - volatile bool running; - volatile bool stop; - opal_event_base_t *evbase; - char *name; -} orte_thread_ctl_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_thread_ctl_t); - -#if OPAL_ENABLE_DEBUG -#define ORTE_ACQUIRE_THREAD(ctl) \ - do { \ - ORTE_THREAD_LOCK(&(ctl)->lock); \ - if (opal_debug_threads) { \ - opal_output(0, "Waiting for thread %s at %s:%d:%s", \ - (NULL == (ctl)->name) ? "NULL" : (ctl)->name, \ - __FILE__, __LINE__, \ - ((ctl)->active) ? "TRUE" : "FALSE"); \ - } \ - while ((ctl)->active) { \ - ORTE_CONDITION_WAIT(&(ctl)->cond, &(ctl)->lock); \ - } \ - if (opal_debug_threads) { \ - opal_output(0, "Thread %s acquired at %s:%d", \ - (NULL == (ctl)->name) ? "NULL" : (ctl)->name, \ - __FILE__, __LINE__); \ - } \ - (ctl)->active = true; \ - } while(0); -#else -#define ORTE_ACQUIRE_THREAD(ctl) \ - do { \ - ORTE_THREAD_LOCK(&(ctl)->lock); \ - while ((ctl)->active) { \ - ORTE_CONDITION_WAIT(&(ctl)->cond, &(ctl)->lock); \ - } \ - (ctl)->active = true; \ - } while(0); -#endif - - -#if OPAL_ENABLE_DEBUG -#define ORTE_RELEASE_THREAD(ctl) \ - do { \ - if (opal_debug_threads) { \ - opal_output(0, "Releasing thread %s at %s:%d", \ - (NULL == (ctl)->name) ? 
"NULL" : (ctl)->name, \ - __FILE__, __LINE__); \ - } \ - (ctl)->active = false; \ - ORTE_CONDITION_BROADCAST(&(ctl)->cond); \ - OPAL_UPDATE_EVBASE((ctl)->evbase, NULL, OPAL_EVENT_NOOP); \ - ORTE_THREAD_UNLOCK(&(ctl)->lock); \ - } while(0); -#else -#define ORTE_RELEASE_THREAD(ctl) \ - do { \ - (ctl)->active = false; \ - ORTE_CONDITION_BROADCAST(&(ctl)->cond); \ - OPAL_UPDATE_EVBASE((ctl)->evbase, NULL, OPAL_EVENT_NOOP); \ - ORTE_THREAD_UNLOCK(&(ctl)->lock); \ - } while(0); -#endif - -#if OPAL_ENABLE_DEBUG -#define ORTE_WAKEUP_THREAD(ctl) \ - do { \ - ORTE_THREAD_LOCK(&(ctl)->lock); \ - if (opal_debug_threads) { \ - opal_output(0, "Waking up thread %s at %s:%d", \ - (NULL == (ctl)->name) ? "NULL" : (ctl)->name, \ - __FILE__, __LINE__); \ - } \ - (ctl)->active = false; \ - ORTE_CONDITION_BROADCAST(&(ctl)->cond); \ - OPAL_UPDATE_EVBASE((ctl)->evbase, NULL, OPAL_EVENT_NOOP); \ - ORTE_THREAD_UNLOCK(&(ctl)->lock); \ - } while(0); -#else -#define ORTE_WAKEUP_THREAD(ctl) \ - do { \ - ORTE_THREAD_LOCK(&(ctl)->lock); \ - (ctl)->active = false; \ - ORTE_CONDITION_BROADCAST(&(ctl)->cond); \ - OPAL_UPDATE_EVBASE((ctl)->evbase, NULL, OPAL_EVENT_NOOP); \ - ORTE_THREAD_UNLOCK(&(ctl)->lock); \ - } while(0); -#endif - -END_C_DECLS - -#endif /* ORTE_THREAD_H */ diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index 79fb3d6d7f..b2b7db55a6 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -13,7 +13,7 @@ # Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. All rights # reserved. 
-# $COPYRIGHT$ + # $COPYRIGHT$ # # Additional copyrights may follow # diff --git a/orte/tools/orte-info/components.c b/orte/tools/orte-info/components.c index f760d7aba2..1bc4f79fb8 100644 --- a/orte/tools/orte-info/components.c +++ b/orte/tools/orte-info/components.c @@ -66,6 +66,8 @@ #include "orte/mca/ess/base/base.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" +#include "orte/mca/state/state.h" +#include "orte/mca/state/base/base.h" #if !ORTE_DISABLE_FULL_SUPPORT #include "orte/mca/notifier/notifier.h" #include "orte/mca/notifier/base/base.h" @@ -239,9 +241,7 @@ void orte_info_open_components(void) map->components = &opal_memory_base_components_opened; opal_pointer_array_add(&component_map, map); - if (OPAL_SUCCESS != opal_event_base_open()) { - goto error; - } + /* the event framework is already open - just get its components */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("event"); map->components = &opal_event_components; @@ -338,6 +338,14 @@ void orte_info_open_components(void) */ orte_process_info.proc_type = ORTE_PROC_HNP; + if (ORTE_SUCCESS != orte_state_base_open()) { + goto error; + } + map = OBJ_NEW(orte_info_component_map_t); + map->type = strdup("state"); + map->components = &orte_state_base_components_available; + opal_pointer_array_add(&component_map, map); + if (ORTE_SUCCESS != orte_errmgr_base_open()) { goto error; } @@ -527,7 +535,8 @@ void orte_info_close_components() (void) mca_oob_base_close(); #endif (void) orte_errmgr_base_close(); - + (void) orte_state_base_close(); + (void) opal_backtrace_base_close(); (void) opal_memory_base_close(); (void) opal_memchecker_base_close(); diff --git a/orte/tools/orte-info/orte-info.c b/orte/tools/orte-info/orte-info.c index e48d692cf4..b3b8e561f7 100644 --- a/orte/tools/orte-info/orte-info.c +++ b/orte/tools/orte-info/orte-info.c @@ -235,6 +235,7 @@ int main(int argc, char *argv[]) opal_pointer_array_add(&mca_types, "filem"); #endif /* these are always included */ + 
opal_pointer_array_add(&mca_types, "state"); opal_pointer_array_add(&mca_types, "errmgr"); opal_pointer_array_add(&mca_types, "ess"); opal_pointer_array_add(&mca_types, "grpcomm"); diff --git a/orte/tools/orte-ps/orte-ps.c b/orte/tools/orte-ps/orte-ps.c index 1d190422b5..a222ba296f 100644 --- a/orte/tools/orte-ps/orte-ps.c +++ b/orte/tools/orte-ps/orte-ps.c @@ -637,8 +637,7 @@ static int pretty_print_vpids(orte_job_t *job) { len_node = 0, len_ckpt_s = 0, len_ckpt_r = 0, - len_ckpt_l = 0, - len_epoch = 0; + len_ckpt_l = 0; int i, line_len; orte_vpid_t v; orte_proc_t *vpid; @@ -664,7 +663,6 @@ static int pretty_print_vpids(orte_job_t *job) { len_ckpt_r = -3; len_ckpt_l = -3; #endif - len_epoch = 6; for(v=0; v < job->num_procs; v++) { char *rankstr; @@ -882,9 +880,6 @@ static int gather_vpid_info(orte_ps_mpirun_info_t *hnpinfo) { if (ORTE_SUCCESS != (ret = orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid, ORTE_VPID_WILDCARD, -#if ORTE_ENABLE_EPOCH - ORTE_EPOCH_WILDCARD, -#endif &cnt, &procs))) { ORTE_ERROR_LOG(ret); diff --git a/orte/tools/orte-top/orte-top.c b/orte/tools/orte-top/orte-top.c index 74f5696fe7..62ce49bed0 100644 --- a/orte/tools/orte-top/orte-top.c +++ b/orte/tools/orte-top/orte-top.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -180,24 +180,9 @@ static void send_cmd(int fd, short dummy, void *arg) num_recvd = 0; if (0 > (ret = orte_rml.send_buffer(&(target_hnp->name), &cmdbuf, ORTE_RML_TAG_DAEMON, 0))) { ORTE_ERROR_LOG(ret); - orte_quit(); + orte_quit(0,0,NULL); return; } - - ORTE_PROGRESSED_WAIT(all_recvd, 0, 1); - - /* flag that field sizes are set */ - fields_set = true; - - /* pretty-print what we got */ - pretty_print(); - - /* see if we want to do it again */ - if (0 < update_rate) { - ORTE_TIMER_EVENT(update_rate, 0, send_cmd); - } else { - orte_quit(); - } } int @@ -282,10 +267,10 @@ main(int argc, char *argv[]) * forward, we need to abort in a manner that allows us * to cleanup */ - opal_event_signal_set(opal_event_base, &term_handler, SIGTERM, + opal_event_signal_set(orte_event_base, &term_handler, SIGTERM, abort_exit_callback, &term_handler); opal_event_signal_add(&term_handler, NULL); - opal_event_signal_set(opal_event_base, &int_handler, SIGINT, + opal_event_signal_set(orte_event_base, &int_handler, SIGINT, abort_exit_callback, &int_handler); opal_event_signal_add(&int_handler, NULL); @@ -484,7 +469,6 @@ main(int argc, char *argv[]) if (NULL == ranks) { /* take all ranks */ proc.vpid = ORTE_VPID_WILDCARD; - ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_WILDCARD); if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); goto cleanup; @@ -534,7 +518,9 @@ SEND: send_cmd(0, 0, NULL); /* now wait until the termination event fires */ - opal_event_dispatch(opal_event_base); + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } /*************** * Cleanup @@ -576,14 +562,13 @@ static void abort_exit_callback(int fd, short ign, void *arg) fclose(fp); } ORTE_UPDATE_EXIT_STATUS(1); - orte_quit(); + orte_quit(0,0,NULL); } -static void process_stats(int fd, short event, void *data) +static void recv_stats(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, orte_rml_tag_t tag, + void* 
cbdata) { - orte_message_event_t *mev = (orte_message_event_t*)data; - opal_buffer_t *buffer = mev->buffer; - orte_process_name_t *sender = &(mev->sender); int32_t n; opal_pstats_t *stats; orte_process_name_t proc; @@ -704,14 +689,24 @@ static void process_stats(int fd, short event, void *data) /* add it to the list */ opal_list_append(&recvd_stats, &stats->super); } - -cleanup: - OBJ_RELEASE(mev); - + + cleanup: /* check for completion */ num_recvd++; if (num_replies <= num_recvd) { - all_recvd = true; + /* flag that field sizes are set */ + fields_set = true; + + /* pretty-print what we got */ + pretty_print(); + + /* see if we want to do it again */ + if (0 < update_rate) { + ORTE_TIMER_EVENT(update_rate, 0, send_cmd, ORTE_SYS_PRI); + } else { + orte_finalize(); + exit(0); + } } /* repost the receive */ @@ -722,26 +717,6 @@ cleanup: } } -static void recv_stats(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, orte_rml_tag_t tag, - void* cbdata) -{ - /* don't process this right away - we need to get out of the recv before - * we process the message as it may ask us to do something that involves - * more messaging! Instead, setup an event so that the message gets processed - * as soon as we leave the recv. 
- * - * The macro makes a copy of the buffer, which we release when processed - the incoming - * buffer, however, is NOT released here, although its payload IS transferred - * to the message buffer for later processing - */ - ORTE_MESSAGE_EVENT(sender, buffer, tag, process_stats); - - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, - "%s recv_stats: reissued recv", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); -} - /* static values needed for printing */ static int lennode = 0; static int lenrank = 0; diff --git a/orte/tools/orterun/Makefile.am b/orte/tools/orterun/Makefile.am index 61d77335ae..86194fd299 100644 --- a/orte/tools/orterun/Makefile.am +++ b/orte/tools/orterun/Makefile.am @@ -52,7 +52,7 @@ orterun_SOURCES = \ main.c \ orterun.c \ orterun.h - + orterun_LDADD = $(top_builddir)/orte/libopen-rte.la endif # !ORTE_DISABLE_FULL_SUPPORT diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index d905b7c170..4076cfc878 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -96,6 +96,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/errmgr_private.h" #include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/state/state.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" @@ -127,7 +128,7 @@ int MPIR_force_to_main = 0; #if !defined(__WINDOWS__) static void orte_debugger_dump(void); static void orte_debugger_init_before_spawn(orte_job_t *jdata); -static void orte_debugger_init_after_spawn(orte_job_t *jdata); +static void orte_debugger_init_after_spawn(int fd, short event, void *arg); static void attach_debugger(int fd, short event, void *arg); static void build_debugger_args(orte_app_context_t *debugger); static void open_fifo (void); @@ -536,7 +537,6 @@ int orterun(int argc, char *argv[]) char *tmp_env_var = NULL; char *param; orte_job_t *daemons; - int32_t ljob; orte_app_context_t *app, *dapp; orte_job_t *jdata=NULL; @@ -746,7 +746,6 @@ int orterun(int argc, char *argv[]) /* cannot 
call ORTE_ERROR_LOG as it could be the errmgr * never got loaded! */ - fprintf(stderr, "FAILED ORTE INIT\n"); return rc; } /* finalize the OPAL utils. As they are opened again from orte_init->opal_init @@ -754,6 +753,7 @@ int orterun(int argc, char *argv[]) */ opal_finalize_util(); + /* get the daemon job object */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); @@ -868,7 +868,7 @@ int orterun(int argc, char *argv[]) * to receive it too */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, - ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); + ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(rc); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); @@ -918,46 +918,33 @@ int orterun(int argc, char *argv[]) } } - /* we may need to look at the apps for the user's job - * to get our full list of nodes, so prep the job for - * launch - start by getting a jobid for it */ - if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(jdata))) { - ORTE_ERROR_LOG(rc); - goto DONE; - } - - /* store it on the global job data pool - this is the key - * step required before we launch the daemons. 
It allows - * the orte_rmaps_base_setup_virtual_machine routine to - * search all apps for any hosts to be used by the vm - */ - ljob = ORTE_LOCAL_JOBID(jdata->jobid); - opal_pointer_array_set_item(orte_job_data, ljob, jdata); - #if !defined(__WINDOWS__) /* setup for debugging */ orte_debugger_init_before_spawn(jdata); + orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS, + orte_debugger_init_after_spawn, + ORTE_SYS_PRI); #endif /* spawn the job and its daemons */ rc = orte_plm.spawn(jdata); -#if !defined(__WINDOWS__) - /* complete debugger interface */ - orte_debugger_init_after_spawn(jdata); -#endif + /* loop the event lib until an exit event is detected */ + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } - /* now wait until the termination event fires */ - opal_event_dispatch(opal_event_base); - - /* we only reach this point by jumping there due - * to an error - so just cleanup and leave - */ DONE: + /* update the exit status, in case it wasn't done */ ORTE_UPDATE_EXIT_STATUS(orte_exit_status); - orte_quit(); - return orte_exit_status; + /* cleanup and leave */ + orte_finalize(); + + if (orte_debug_flag) { + fprintf(stderr, "exiting with status %d\n", orte_exit_status); + } + exit(orte_exit_status); } static int init_globals(void) @@ -2380,7 +2367,6 @@ static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, static void attach_debugger(int fd, short event, void *arg); static void build_debugger_args(orte_app_context_t *debugger); static void open_fifo(void); -static opal_event_t attach; static int attach_fd = -1; static bool fifo_active=false; #define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X); @@ -2452,7 +2438,7 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata) "%s Setting debugger attach check rate for %d seconds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_debugger_check_rate); - ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger); + 
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger, ORTE_SYS_PRI); } else if (orte_create_session_dirs) { /* create the attachment FIFO and setup readevent - cannot be * done if no session dirs exist! @@ -2539,8 +2525,10 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata) * that attaches to us post-launch of the application can get a * completed proctable */ -static void orte_debugger_init_after_spawn(orte_job_t *jdata) +void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; orte_proc_t *proc; orte_app_context_t *appctx; orte_vpid_t i, j; @@ -2557,6 +2545,7 @@ static void orte_debugger_init_after_spawn(orte_job_t *jdata) opal_output_verbose(5, orte_debug_output, "%s: debugger already initialized or zero procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + OBJ_RELEASE(caddy); return; } @@ -2576,6 +2565,7 @@ static void orte_debugger_init_after_spawn(orte_job_t *jdata) MPIR_proctable_size); if (MPIR_proctable == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(caddy); return; } @@ -2623,9 +2613,10 @@ static void orte_debugger_init_after_spawn(orte_job_t *jdata) /* wait for all procs to have reported their contact info - this * ensures that (a) they are all into mpi_init, and (b) the system * has the contact info to successfully send a message to rank=0 - */ + * ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs); - + */ + MPIR_Breakpoint(); /* send a message to rank=0 to release it */ @@ -2637,10 +2628,14 @@ static void orte_debugger_init_after_spawn(orte_job_t *jdata) } OBJ_DESTRUCT(&buf); } + + OBJ_RELEASE(caddy); } static void open_fifo (void) { + opal_event_t *attach; + if (attach_fd > 0) { close(attach_fd); } @@ -2655,10 +2650,11 @@ static void open_fifo (void) "%s Monitoring debugger attach fifo %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), MPIR_attach_fifo); - opal_event_set(opal_event_base, &attach, 
attach_fd, OPAL_EV_READ, attach_debugger, NULL); + attach = (opal_event_t*)malloc(sizeof(opal_event_t)); + opal_event_set(orte_event_base, attach, attach_fd, OPAL_EV_READ, attach_debugger, attach); fifo_active = true; - opal_event_add(&attach, 0); + opal_event_add(attach, 0); } static void attach_debugger(int fd, short event, void *arg) @@ -2672,27 +2668,40 @@ static void attach_debugger(int fd, short event, void *arg) orte_node_t *node; orte_proc_t *proc; orte_vpid_t vpid=0; + orte_timer_t *tm; + opal_event_t *attach; - /* read the file descriptor to clear that event, if necessary */ if (fifo_active) { - opal_event_del(&attach); + attach = (opal_event_t*)arg; fifo_active = false; rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd)); if (!rc) { + /* release the current event */ + opal_event_free(attach); /* reopen device to clear hangup */ open_fifo(); return; } if (1 != fifo_cmd) { /* ignore the cmd */ - goto RELEASE; + fifo_active = true; + opal_event_add(attach, 0); + return; } } if (!MPIR_being_debugged && !orte_debugger_test_attach) { - /* false alarm */ - goto RELEASE; + /* false alarm - reset the read or timer event */ + if (0 == orte_debugger_check_rate) { + fifo_active = true; + opal_event_add(attach, 0); + } else if (!MPIR_being_debugged) { + tm = (orte_timer_t*)arg; + /* re-add the event */ + opal_event_evtimer_add(tm->ev, &tm->tv); + } + return; } opal_output_verbose(1, orte_debug_output, @@ -2781,7 +2790,6 @@ static void attach_debugger(int fd, short event, void *arg) proc = OBJ_NEW(orte_proc_t); proc->name.jobid = jdata->jobid; proc->name.vpid = vpid++; - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN); /* set the local/node ranks - we don't actually care * what these are, but the odls needs them */ @@ -2814,9 +2822,11 @@ static void attach_debugger(int fd, short event, void *arg) /* reset the read or timer event */ if (0 == orte_debugger_check_rate) { fifo_active = true; - opal_event_add(&attach, 0); + opal_event_add(attach, 0); } else if 
(!MPIR_being_debugged) { - ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger); + tm = (orte_timer_t*)arg; + /* re-add the event */ + opal_event_evtimer_add(tm->ev, &tm->tv); } /* notify the debugger that all is ready */ diff --git a/orte/util/comm/comm.c b/orte/util/comm/comm.c index ee27705850..152607ed6a 100644 --- a/orte/util/comm/comm.c +++ b/orte/util/comm/comm.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2010 Los Alamos National Security, LLC. + * Copyright (c) 2010-2012 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -27,6 +27,7 @@ #include "opal/util/output.h" #include "opal/threads/tsd.h" #include "opal/mca/event/event.h" +#include "opal/runtime/opal_progress.h" #include "opal/dss/dss.h" #include "orte/mca/errmgr/errmgr.h" @@ -50,8 +51,9 @@ static int error_exit; static void quicktime_cb(int fd, short event, void *cbdata) { + /* release the timer */ if (NULL != quicktime) { - free(quicktime); + opal_event_free(quicktime); quicktime = NULL; } error_exit = ORTE_ERR_SILENT; @@ -65,12 +67,13 @@ static void send_cbfunc(int status, orte_process_name_t* sender, { /* cancel the timer */ if (NULL != quicktime) { - opal_event_evtimer_del(quicktime); - free(quicktime); + opal_event_free(quicktime); quicktime = NULL; } /* declare the work done */ timer_fired = true; + /* release the message */ + OBJ_RELEASE(buffer); } static void recv_info(int status, orte_process_name_t* sender, @@ -81,8 +84,7 @@ static void recv_info(int status, orte_process_name_t* sender, /* cancel the timer */ if (NULL != quicktime) { - opal_event_evtimer_del(quicktime); - free (quicktime); + opal_event_free (quicktime); quicktime = NULL; } /* xfer the answer */ @@ -132,18 +134,19 @@ static bool step=false; int orte_util_comm_report_event(orte_comm_event_t ev) { int rc, i; - opal_buffer_t buf; + opal_buffer_t *buf; 
orte_node_t *node; - + struct timeval tv; + /* if nothing is connected, ignore this */ if (!tool_connected) { return ORTE_SUCCESS; } /* init a buffer for the data */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); + buf = OBJ_NEW(opal_buffer_t); /* flag the type of event */ - opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT); + opal_dss.pack(buf, &ev, 1, ORTE_COMM_EVENT); switch (ev) { case ORTE_COMM_EVENT_ALLOCATE: @@ -152,7 +155,7 @@ int orte_util_comm_report_event(orte_comm_event_t ev) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } - opal_dss.pack(&buf, &node->name, 1, OPAL_STRING); + opal_dss.pack(buf, &node->name, 1, OPAL_STRING); } break; @@ -164,18 +167,36 @@ int orte_util_comm_report_event(orte_comm_event_t ev) default: ORTE_ERROR_LOG(ORTE_ERROR); - OBJ_DESTRUCT(&buf); + OBJ_RELEASE(buf); return ORTE_ERROR; break; } + /* define a max time to wait for send to complete */ + timer_fired = false; + error_exit = ORTE_SUCCESS; + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); + /* do the send */ - if (0 > (rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0))) { + if (0 > (rc = orte_rml.send_buffer_nb(&tool, buf, ORTE_RML_TAG_TOOL, 0, send_cbfunc, NULL))) { ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); + OBJ_RELEASE(buf); return rc; } + while (!timer_fired) { + opal_progress(); + } + + if (ORTE_SUCCESS != error_exit) { + return error_exit; + } + if (step) { /* the caller wants to wait until an ack is received - * define a max time to wait for an answer @@ -183,7 +204,6 @@ int orte_util_comm_report_event(orte_comm_event_t ev) OBJ_CONSTRUCT(&answer, opal_buffer_t); timer_fired = false; error_exit = ORTE_SUCCESS; - ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); /* get the answer */ if (ORTE_SUCCESS != (rc = 
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, @@ -191,18 +211,21 @@ int orte_util_comm_report_event(orte_comm_event_t ev) ORTE_RML_NON_PERSISTENT, recv_info, NULL))) { - /* cancel the timer */ - if (NULL != quicktime) { - opal_event_evtimer_del(quicktime); - free(quicktime); - quicktime = NULL; - } ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&answer); return rc; } + /* set a timer for getting the answer */ + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + while (!timer_fired) { + opal_progress(); + } /* cleanup */ OBJ_DESTRUCT(&answer); @@ -224,6 +247,7 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j opal_buffer_t *cmd; orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_JOB_INFO_CMD; orte_job_t **job_info; + struct timeval tv; /* set default response */ *num_jobs = 0; @@ -241,28 +265,26 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j OBJ_RELEASE(cmd); return ret; } - /* define a max time to wait for send to complete */ + + /* define a max time to wait for send to complete */ timer_fired = false; error_exit = ORTE_SUCCESS; - ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); - + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); + /* do the send */ - if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, - send_cbfunc, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, send_cbfunc, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } - /* wait for send to complete */ 
- ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - - /* release the buffer */ - OBJ_RELEASE(cmd); - - /* did it succeed? */ - if (ORTE_SUCCESS != error_exit) { - return error_exit; + while (!timer_fired) { + opal_progress(); } /* setup for answer */ @@ -271,7 +293,6 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j /* define a max time to wait for an answer */ timer_fired = false; error_exit = ORTE_SUCCESS; - ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); /* get the answer */ if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, @@ -279,18 +300,21 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j ORTE_RML_NON_PERSISTENT, recv_info, NULL))) { - /* cancel the timer */ - if (NULL != quicktime) { - opal_event_evtimer_del(quicktime); - free(quicktime); - quicktime = NULL; - } ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } + /* set a timer for getting the answer */ + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + while (!timer_fired) { + opal_progress(); + } if (ORTE_SUCCESS != error_exit) { OBJ_DESTRUCT(&answer); @@ -333,7 +357,8 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node, opal_buffer_t *cmd; orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_NODE_INFO_CMD; orte_node_t **node_info; - + struct timeval tv; + /* set default response */ *num_nodes = 0; *node_info_array = NULL; @@ -350,24 +375,27 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node, OBJ_RELEASE(cmd); return ret; } - /* define a max time to wait for send to complete */ + + /* define a max time to wait for send to complete */ timer_fired = false; error_exit = ORTE_SUCCESS; - 
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); - + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); + /* do the send */ - if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, - send_cbfunc, NULL))) { + if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, send_cbfunc, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } - /* wait for send to complete */ - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - - /* release the buffer */ - OBJ_RELEASE(cmd); + while (!timer_fired) { + opal_progress(); + } /* did it succeed? */ if (ORTE_SUCCESS != error_exit) { @@ -377,7 +405,6 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node, /* define a max time to wait for an answer */ timer_fired = false; error_exit = ORTE_SUCCESS; - ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb); /* get the answer */ OBJ_CONSTRUCT(&answer, opal_buffer_t); @@ -386,18 +413,21 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node, ORTE_RML_NON_PERSISTENT, recv_info, NULL))) { - /* cancel the timer */ - if (NULL != quicktime) { - opal_event_evtimer_del(quicktime); - free (quicktime); - quicktime = NULL; - } ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } + /* set a timer for getting the answer */ + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + while (!timer_fired) { + opal_progress(); + } if (ORTE_SUCCESS != error_exit) { OBJ_DESTRUCT(&answer); @@ -432,19 +462,15 @@ int 
orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node, return ORTE_SUCCESS; } -#if ORTE_ENABLE_EPOCH -int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid, - orte_epoch_t epoch, int *num_procs, orte_proc_t ***proc_info_array) -#else int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid, int *num_procs, orte_proc_t ***proc_info_array) -#endif { int ret; int32_t cnt, cnt_procs, n; opal_buffer_t *cmd; orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD; orte_proc_t **proc_info; + struct timeval tv; /* set default response */ *num_procs = 0; @@ -467,17 +493,16 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t OBJ_RELEASE(cmd); return ret; } -#if ORTE_ENABLE_EPOCH - if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(cmd); - return ret; - } -#endif + /* define a max time to wait for send to complete */ timer_fired = false; error_exit = ORTE_SUCCESS; - ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); /* do the send */ if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, @@ -487,11 +512,9 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t return ret; } - /* wait for send to complete */ - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); - - /* release the buffer */ - OBJ_RELEASE(cmd); + while (!timer_fired) { + opal_progress(); + } /* did it succeed? 
*/ if (ORTE_SUCCESS != error_exit) { @@ -501,7 +524,6 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t /* define a max time to wait for an answer */ timer_fired = false; error_exit = ORTE_SUCCESS; - ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb); /* get the answer */ OBJ_CONSTRUCT(&answer, opal_buffer_t); @@ -510,18 +532,21 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t ORTE_RML_NON_PERSISTENT, recv_info, NULL))) { - /* cancel the timer */ - if (NULL != quicktime) { - opal_event_evtimer_del(quicktime); - free(quicktime); - quicktime = NULL; - } ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } + /* set a timer for getting the answer */ + quicktime = opal_event_alloc(); + tv.tv_sec = 0; + tv.tv_usec = 100000; + opal_event_evtimer_set(orte_event_base, quicktime, quicktime_cb, NULL); + opal_event_set_priority(quicktime, ORTE_ERROR_PRI); + opal_event_evtimer_add(quicktime, &tv); - ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); + while (!timer_fired) { + opal_progress(); + } if (ORTE_SUCCESS != error_exit) { OBJ_DESTRUCT(&answer); diff --git a/orte/util/comm/comm.h b/orte/util/comm/comm.h index 52c655ad25..f40c23377d 100644 --- a/orte/util/comm/comm.h +++ b/orte/util/comm/comm.h @@ -52,9 +52,6 @@ ORTE_DECLSPEC int orte_util_comm_query_node_info(const orte_process_name_t *hnp, int *num_nodes, orte_node_t ***node_info_array); ORTE_DECLSPEC int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid, -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch, -#endif int *num_procs, orte_proc_t ***proc_info_array); ORTE_DECLSPEC int orte_util_comm_spawn_job(const orte_process_name_t *hnp, orte_job_t *jdata); diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index d785bb016a..828358e5ae 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -9,7 +9,9 @@ * University of Stuttgart. All rights reserved. 
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,6 +33,7 @@ #endif #endif +#include "orte/mca/plm/plm_types.h" #include "orte/util/error_strings.h" #include "orte/runtime/orte_globals.h" @@ -196,171 +199,135 @@ int orte_err2str(int errnum, const char **errmsg) const char *orte_job_state_to_str(orte_job_state_t state) { switch(state) { - case ORTE_JOB_STATE_UNDEF: - return "UNDEFINED"; - case ORTE_JOB_STATE_INIT: - return "INITIALIZED"; - case ORTE_JOB_STATE_RESTART: - return "RESTARTING"; - case ORTE_JOB_STATE_LAUNCHED: - return "LAUNCHED"; - case ORTE_JOB_STATE_RUNNING: - return "RUNNING"; - case ORTE_JOB_STATE_SUSPENDED: - return "SUSPENDED"; - case ORTE_JOB_STATE_REGISTERED: - return "SYNC REGISTERED"; - case ORTE_JOB_STATE_UNTERMINATED: - return "UNTERMINATED"; - case ORTE_JOB_STATE_TERMINATED: - return "NORMALLY TERMINATED"; - case ORTE_JOB_STATE_ABORTED: - return "ABORTED"; - case ORTE_JOB_STATE_FAILED_TO_START: - return "FAILED TO START"; - case ORTE_JOB_STATE_ABORTED_BY_SIG: - return "ABORTED BY SIGNAL"; - case ORTE_JOB_STATE_ABORTED_WO_SYNC: - return "TERMINATED WITHOUT SYNC"; - case ORTE_JOB_STATE_KILLED_BY_CMD: - return "KILLED BY INTERNAL COMMAND"; - case ORTE_JOB_STATE_COMM_FAILED: - return "COMMUNICATION FAILURE"; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - return "SENSOR BOUND EXCEEDED"; - break; - case ORTE_JOB_STATE_NEVER_LAUNCHED: - return "NEVER LAUNCHED"; - case ORTE_JOB_STATE_ABORT_ORDERED: - return "ABORT IN PROGRESS"; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - return "HEARTBEAT FAILED"; - case ORTE_JOB_STATE_PROCS_MIGRATING: - return "PROCS MIGRATING"; - case ORTE_JOB_STATE_NON_ZERO_TERM: - return "AT LEAST ONE PROCESS 
EXITED WITH NON-ZERO STATUS"; - case ORTE_JOB_STATE_SILENT_ABORT: - return "ERROR REPORTED ELSEWHERE"; - default: - return "UNKNOWN STATE!"; + case ORTE_JOB_STATE_UNDEF: + return "UNDEFINED"; + case ORTE_JOB_STATE_INIT: + return "PENDING INIT"; + case ORTE_JOB_STATE_ALLOCATE: + return "PENDING ALLOCATION"; + case ORTE_JOB_STATE_MAP: + return "PENDING MAPPING"; + case ORTE_JOB_STATE_SYSTEM_PREP: + return "PENDING FINAL SYSTEM PREP"; + case ORTE_JOB_STATE_LAUNCH_DAEMONS: + return "PENDING DAEMON LAUNCH"; + case ORTE_JOB_STATE_DAEMONS_LAUNCHED: + return "DAEMONS LAUNCHED"; + case ORTE_JOB_STATE_DAEMONS_REPORTED: + return "ALL DAEMONS REPORTED"; + case ORTE_JOB_STATE_LAUNCH_APPS: + return "PENDING APP LAUNCH"; + case ORTE_JOB_STATE_RUNNING: + return "RUNNING"; + case ORTE_JOB_STATE_SUSPENDED: + return "SUSPENDED"; + case ORTE_JOB_STATE_REGISTERED: + return "SYNC REGISTERED"; + case ORTE_JOB_STATE_READY_FOR_DEBUGGERS: + return "READY FOR DEBUGGERS"; + case ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE: + return "LOCAL LAUNCH COMPLETE"; + case ORTE_JOB_STATE_UNTERMINATED: + return "UNTERMINATED"; + case ORTE_JOB_STATE_TERMINATED: + return "NORMALLY TERMINATED"; + case ORTE_JOB_STATE_ALL_JOBS_COMPLETE: + return "ALL JOBS COMPLETE"; + case ORTE_JOB_STATE_ERROR: + return "ARTIFICIAL BOUNDARY - ERROR"; + case ORTE_JOB_STATE_KILLED_BY_CMD: + return "KILLED BY INTERNAL COMMAND"; + case ORTE_JOB_STATE_ABORTED: + return "ABORTED"; + case ORTE_JOB_STATE_FAILED_TO_START: + return "FAILED TO START"; + case ORTE_JOB_STATE_ABORTED_BY_SIG: + return "ABORTED BY SIGNAL"; + case ORTE_JOB_STATE_ABORTED_WO_SYNC: + return "TERMINATED WITHOUT SYNC"; + case ORTE_JOB_STATE_COMM_FAILED: + return "COMMUNICATION FAILURE"; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + return "SENSOR BOUND EXCEEDED"; + case ORTE_JOB_STATE_CALLED_ABORT: + return "PROC CALLED ABORT"; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + return "HEARTBEAT FAILED"; + case ORTE_JOB_STATE_NEVER_LAUNCHED: + return "NEVER LAUNCHED"; + case 
ORTE_JOB_STATE_ABORT_ORDERED: + return "ABORT IN PROGRESS"; + case ORTE_JOB_STATE_NON_ZERO_TERM: + return "AT LEAST ONE PROCESS EXITED WITH NON-ZERO STATUS"; + case ORTE_JOB_STATE_FAILED_TO_LAUNCH: + return "FAILED TO LAUNCH"; + case ORTE_JOB_STATE_FORCED_EXIT: + return "FORCED EXIT"; + case ORTE_JOB_STATE_DAEMONS_TERMINATED: + return "DAEMONS TERMINATED"; + case ORTE_JOB_STATE_SILENT_ABORT: + return "ERROR REPORTED ELSEWHERE"; + case ORTE_JOB_STATE_ANY: + return "ANY"; + default: + return "UNKNOWN STATE!"; } } const char *orte_proc_state_to_str(orte_proc_state_t state) { switch(state) { - case ORTE_PROC_STATE_UNDEF: - return "UNDEFINED"; - case ORTE_PROC_STATE_INIT: - return "INITIALIZED"; - case ORTE_PROC_STATE_RESTART: - return "RESTARTING"; - case ORTE_PROC_STATE_LAUNCHED: - return "LAUNCHED"; - case ORTE_PROC_STATE_RUNNING: - return "RUNNING"; - case ORTE_PROC_STATE_REGISTERED: - return "SYNC REGISTERED"; - case ORTE_PROC_STATE_UNTERMINATED: - return "UNTERMINATED"; - case ORTE_PROC_STATE_TERMINATED: - return "NORMALLY TERMINATED"; - case ORTE_PROC_STATE_ABORTED: - return "ABORTED"; - case ORTE_PROC_STATE_FAILED_TO_START: - return "FAILED TO START"; - case ORTE_PROC_STATE_ABORTED_BY_SIG: - return "ABORTED BY SIGNAL"; - case ORTE_PROC_STATE_TERM_WO_SYNC: - return "TERMINATED WITHOUT SYNC"; - case ORTE_PROC_STATE_KILLED_BY_CMD: - return "KILLED BY INTERNAL COMMAND"; - case ORTE_PROC_STATE_COMM_FAILED: - return "COMMUNICATION FAILURE"; - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - return "SENSOR BOUND EXCEEDED"; - break; - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - return "HEARTBEAT FAILED"; - break; - case ORTE_PROC_STATE_MIGRATING: - return "MIGRATING"; - case ORTE_PROC_STATE_CANNOT_RESTART: - return "CANNOT BE RESTARTED"; - case ORTE_PROC_STATE_TERM_NON_ZERO: - return "EXITED WITH NON-ZERO STATUS"; - case ORTE_PROC_STATE_RESTARTED: - return "RESTART DETECTED"; - default: - return "UNKNOWN STATE!"; - } -} - -const char *orte_proc_exit_code_to_signal(int 
exit_code) -{ - int signal; - - signal = exit_code - 128; - - switch(signal) { - case SIGHUP: - return "SIGHUP"; - case SIGINT: - return "SIGINT"; - case SIGQUIT: - return "SIGQUIT"; - case SIGILL: - return "SIGILL"; - case SIGTRAP: - return "SIGTRAP"; - case SIGABRT: - return "SIGABRT"; - case SIGFPE: - return "SIGFPE"; - case SIGKILL: - return "SIGKILL"; - case SIGBUS: - return "SIGBUS"; - case SIGSEGV: - return "SIGSEGV"; - case SIGPIPE: - return "SIGPIPE"; - case SIGALRM: - return "SIGALRM"; - case SIGTERM: - return "SIGTERM"; - case SIGURG: - return "SIGURG"; - case SIGSTOP: - return "SIGSTOP"; - case SIGTSTP: - return "SIGTSTP"; - case SIGCONT: - return "SIGCONT"; - case SIGCHLD: - return "SIGCHLD"; - case SIGTTIN: - return "SIGTTIN"; - case SIGTTOU: - return "SIGTTOU"; - case SIGIO: - return "SIGIO"; - case SIGXCPU: - return "SIGXCPU"; - case SIGXFSZ: - return "SIGXFSZ"; - case SIGVTALRM: - return "SIGVTALRM"; - case SIGPROF: - return "SIGPROF"; - case SIGWINCH: - return "SIGWINCH"; - case SIGUSR1: - return "SIGUSR1"; - case SIGUSR2: - return "SIGUSR2"; + case ORTE_PROC_STATE_UNDEF: + return "UNDEFINED"; + case ORTE_PROC_STATE_INIT: + return "INITIALIZED"; + case ORTE_PROC_STATE_RESTART: + return "RESTARTING"; + case ORTE_PROC_STATE_TERMINATE: + return "MARKED FOR TERMINATION"; + case ORTE_PROC_STATE_RUNNING: + return "RUNNING"; + case ORTE_PROC_STATE_REGISTERED: + return "SYNC REGISTERED"; + case ORTE_PROC_STATE_IOF_COMPLETE: + return "IOF COMPLETE"; + case ORTE_PROC_STATE_WAITPID_FIRED: + return "WAITPID FIRED"; + case ORTE_PROC_STATE_UNTERMINATED: + return "UNTERMINATED"; + case ORTE_PROC_STATE_TERMINATED: + return "NORMALLY TERMINATED"; + case ORTE_PROC_STATE_ERROR: + return "ARTIFICIAL BOUNDARY - ERROR"; + case ORTE_PROC_STATE_KILLED_BY_CMD: + return "KILLED BY INTERNAL COMMAND"; + case ORTE_PROC_STATE_ABORTED: + return "ABORTED"; + case ORTE_PROC_STATE_FAILED_TO_START: + return "FAILED TO START"; + case ORTE_PROC_STATE_ABORTED_BY_SIG: + return "ABORTED 
BY SIGNAL"; + case ORTE_PROC_STATE_TERM_WO_SYNC: + return "TERMINATED WITHOUT SYNC"; + case ORTE_PROC_STATE_COMM_FAILED: + return "COMMUNICATION FAILURE"; + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + return "SENSOR BOUND EXCEEDED"; + case ORTE_PROC_STATE_CALLED_ABORT: + return "CALLED ABORT"; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + return "HEARTBEAT FAILED"; + case ORTE_PROC_STATE_MIGRATING: + return "MIGRATING"; + case ORTE_PROC_STATE_CANNOT_RESTART: + return "CANNOT BE RESTARTED"; + case ORTE_PROC_STATE_TERM_NON_ZERO: + return "EXITED WITH NON-ZERO STATUS"; + case ORTE_PROC_STATE_FAILED_TO_LAUNCH: + return "FAILED TO LAUNCH"; + case ORTE_PROC_STATE_ANY: + return "ANY"; default: - return "UNRECOGNIZED"; + return "UNKNOWN STATE!"; } } - diff --git a/orte/util/error_strings.h b/orte/util/error_strings.h index ce8c9868ba..9ca5ca9656 100644 --- a/orte/util/error_strings.h +++ b/orte/util/error_strings.h @@ -39,7 +39,5 @@ ORTE_DECLSPEC const char *orte_job_state_to_str(orte_job_state_t state); ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state); -ORTE_DECLSPEC const char *orte_proc_exit_code_to_signal(int exit_code); - END_C_DECLS #endif diff --git a/orte/util/hnp_contact.c b/orte/util/hnp_contact.c index 147a90290c..398653be57 100644 --- a/orte/util/hnp_contact.c +++ b/orte/util/hnp_contact.c @@ -54,7 +54,6 @@ static void orte_hnp_contact_construct(orte_hnp_contact_t *ptr) { ptr->name.jobid = ORTE_JOBID_INVALID; ptr->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN); ptr->rml_uri = NULL; } diff --git a/orte/util/name_fns.c b/orte/util/name_fns.c index 70933b5795..5e900beca2 100644 --- a/orte/util/name_fns.c +++ b/orte/util/name_fns.c @@ -45,7 +45,6 @@ static void orte_namelist_construct(orte_namelist_t* list) { list->name.jobid = ORTE_JOBID_INVALID; list->name.vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(list->name.epoch,ORTE_EPOCH_MIN); } /* destructor - used to free any resources held by instance */ @@ -116,9 
+115,6 @@ char* orte_util_print_name_args(const orte_process_name_t *name) { orte_print_args_buffers_t *ptr; char *job, *vpid; -#if ORTE_ENABLE_EPOCH - char *epoch; -#endif /* protect against NULL names */ if (NULL == name) { @@ -136,14 +132,13 @@ char* orte_util_print_name_args(const orte_process_name_t *name) return ptr->buffers[ptr->cntr-1]; } - /* get the jobid, vpid, and epoch strings first - this will protect us from + /* get the jobid, vpid strings first - this will protect us from * stepping on each other's buffer. This also guarantees * that the print_args function has been initialized, so * we don't need to duplicate that here */ job = orte_util_print_jobids(name->jobid); vpid = orte_util_print_vpids(name->vpid); - ORTE_EPOCH_SET(epoch,orte_util_print_epoch(name->epoch)); /* get the next buffer */ ptr = get_print_name_buffer(); @@ -158,15 +153,9 @@ char* orte_util_print_name_args(const orte_process_name_t *name) ptr->cntr = 0; } -#if ORTE_ENABLE_EPOCH - snprintf(ptr->buffers[ptr->cntr++], - ORTE_PRINT_NAME_ARGS_MAX_SIZE, - "[%s,%s,%s]", job, vpid, epoch); -#else snprintf(ptr->buffers[ptr->cntr++], ORTE_PRINT_NAME_ARGS_MAX_SIZE, "[%s,%s]", job, vpid); -#endif return ptr->buffers[ptr->cntr-1]; } @@ -290,36 +279,6 @@ char* orte_util_print_vpids(const orte_vpid_t vpid) return ptr->buffers[ptr->cntr-1]; } -#if ORTE_ENABLE_EPOCH -char* orte_util_print_epoch(const orte_epoch_t epoch) -{ - orte_print_args_buffers_t *ptr; - - ptr = get_print_name_buffer(); - - if (NULL == ptr) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return orte_print_args_null; - } - - /* cycle around the ring */ - if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { - ptr->cntr = 0; - } - - if (ORTE_EPOCH_INVALID == epoch) { - snprintf(ptr->buffers[ptr->cntr++], ORTE_PRINT_NAME_ARGS_MAX_SIZE, "INVALID"); - } else if (ORTE_EPOCH_WILDCARD == epoch) { - snprintf(ptr->buffers[ptr->cntr++], ORTE_PRINT_NAME_ARGS_MAX_SIZE, "WILDCARD"); - } else { - snprintf(ptr->buffers[ptr->cntr++], - 
ORTE_PRINT_NAME_ARGS_MAX_SIZE, - "%ld", (long)epoch); - } - return ptr->buffers[ptr->cntr-1]; -} -#endif - /*** STRING FUNCTIONS ***/ @@ -413,70 +372,17 @@ int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring) return ORTE_SUCCESS; } -#if ORTE_ENABLE_EPOCH -int orte_util_convert_epoch_to_string(char **epoch_string, const orte_epoch_t epoch) -{ - /* check for wildcard value - handle appropriately */ - if (ORTE_EPOCH_WILDCARD == epoch) { - *epoch_string = strdup(ORTE_SCHEMA_WILDCARD_STRING); - return ORTE_SUCCESS; - } - - /* check for invalid value - handle appropriately */ - if (ORTE_EPOCH_INVALID == epoch) { - *epoch_string = strdup(ORTE_SCHEMA_INVALID_STRING); - return ORTE_SUCCESS; - } - - if (0 > asprintf(epoch_string, "%ld", (long) epoch)) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - -int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* epoch_string) -{ - if (NULL == epoch_string) { /* got an error */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - *epoch = ORTE_EPOCH_INVALID; - return ORTE_ERR_BAD_PARAM; - } - - /** check for wildcard character - handle appropriately */ - if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, epoch_string)) { - *epoch = ORTE_EPOCH_WILDCARD; - return ORTE_SUCCESS; - } - - /* check for invalid value */ - if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, epoch_string)) { - *epoch = ORTE_EPOCH_INVALID; - return ORTE_SUCCESS; - } - - *epoch = strtol(epoch_string, NULL, 10); - - return ORTE_SUCCESS; -} -#endif - int orte_util_convert_string_to_process_name(orte_process_name_t *name, const char* name_string) { char *temp, *token; orte_jobid_t job; orte_vpid_t vpid; -#if ORTE_ENABLE_EPOCH - orte_epoch_t epoch; -#endif int return_code=ORTE_SUCCESS; /* set default */ name->jobid = ORTE_JOBID_INVALID; name->vpid = ORTE_VPID_INVALID; - ORTE_EPOCH_SET(name->epoch,ORTE_EPOCH_MIN); /* check for NULL string - error */ if (NULL == name_string) { @@ -523,30 
+429,8 @@ int orte_util_convert_string_to_process_name(orte_process_name_t *name, vpid = strtoul(token, NULL, 10); } -#if ORTE_ENABLE_EPOCH - token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> epoch*/ - - /* check for error */ - if (NULL == token) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - - /* check for WILDCARD character - assign - * value accordingly, if found - */ - if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) { - epoch = ORTE_EPOCH_WILDCARD; - } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) { - epoch = ORTE_EPOCH_INVALID; - } else { - epoch = strtoul(token, NULL, 10); - } -#endif - name->jobid = job; name->vpid = vpid; - ORTE_EPOCH_SET(name->epoch,epoch); free(temp); @@ -583,17 +467,7 @@ int orte_util_convert_process_name_to_string(char **name_string, asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned long)name->vpid); } -#if ORTE_ENABLE_EPOCH - if (ORTE_EPOCH_WILDCARD == name->epoch) { - asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING); - } else if (ORTE_EPOCH_INVALID == name->epoch) { - asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING); - } else { - asprintf(name_string, "%s%c%lu", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned long)name->epoch); - } -#else asprintf(name_string, "%s", tmp2); -#endif free(tmp); free(tmp2); @@ -606,9 +480,6 @@ int orte_util_convert_process_name_to_string(char **name_string, int orte_util_create_process_name(orte_process_name_t **name, orte_jobid_t job, orte_vpid_t vpid -#if ORTE_ENABLE_EPOCH - ,orte_epoch_t epoch -#endif ) { *name = NULL; @@ -621,7 +492,6 @@ int orte_util_create_process_name(orte_process_name_t **name, (*name)->jobid = job; (*name)->vpid = vpid; - ORTE_EPOCH_SET((*name)->epoch,epoch); return ORTE_SUCCESS; } @@ -679,20 +549,6 @@ int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields, } } -#if ORTE_ENABLE_EPOCH - /* Get here if 
jobid's and vpid's are equal, or not being checked. - * Now check epoch. - */ - - if (ORTE_NS_CMP_EPOCH & fields) { - if (name1->epoch < name2->epoch) { - return OPAL_VALUE2_GREATER; - } else if (name1->epoch> name2->epoch) { - return OPAL_VALUE1_GREATER; - } - } -#endif - /* only way to get here is if all fields are being checked and are equal, * or jobid not checked, but vpid equal, * only vpid being checked, and equal @@ -708,8 +564,6 @@ uint64_t orte_util_hash_name(const orte_process_name_t * name) { hash = name->jobid; hash <<= sizeof(name->jobid) * 8; hash += name->vpid; - /* Intentionally not using epoch. This would mess up the modex when a - * process is restarted. */ return hash; } diff --git a/orte/util/name_fns.h b/orte/util/name_fns.h index 27097efaf8..d5e35aa551 100644 --- a/orte/util/name_fns.h +++ b/orte/util/name_fns.h @@ -44,7 +44,6 @@ typedef uint8_t orte_ns_cmp_bitmask_t; /**< Bit mask for comparing process nam #define ORTE_NS_CMP_NONE 0x00 #define ORTE_NS_CMP_JOBID 0x02 #define ORTE_NS_CMP_VPID 0x04 -#define ORTE_NS_CMP_EPOCH 0x08 #define ORTE_NS_CMP_ALL 0x0f #define ORTE_NS_CMP_WILD 0x10 @@ -61,14 +60,6 @@ ORTE_DECLSPEC char* orte_util_print_vpids(const orte_vpid_t vpid); #define ORTE_VPID_PRINT(n) \ orte_util_print_vpids(n) -#if ORTE_ENABLE_EPOCH -ORTE_DECLSPEC char* orte_util_print_epoch(const orte_epoch_t epoch); -#define ORTE_EPOCH_PRINT(n) \ - orte_util_print_epoch(n) -#else -#define ORTE_EPOCH_PRINT(n) "" -#endif - ORTE_DECLSPEC char* orte_util_print_job_family(const orte_jobid_t job); #define ORTE_JOB_FAMILY_PRINT(n) \ orte_util_print_job_family(n) @@ -108,27 +99,9 @@ ORTE_DECLSPEC char *orte_pretty_print_timing(int64_t secs, int64_t usecs); #define ORTE_JOBID_IS_DAEMON(n) \ !((n) & 0x0000ffff) -/* Macro for getting the epoch out of the process name */ -#if ORTE_ENABLE_EPOCH -#define ORTE_EPOCH_GET(n) \ - ((n)->epoch) -#else -#define ORTE_EPOCH_GET(n) 0 -#endif - -/* Macro for setting the epoch in the process name */ -#if 
ORTE_ENABLE_EPOCH -#define ORTE_EPOCH_SET(n,m) \ - ( (n) = (m) ) -#else -#define ORTE_EPOCH_SET(n,m) \ - do { \ - } while(0); -#endif - /* List of names for general use */ struct orte_namelist_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ + opal_list_item_t super; /**< Allows this item to be placed on a list */ orte_process_name_t name; /**< Name of a process */ }; typedef struct orte_namelist_t orte_namelist_t; @@ -139,24 +112,14 @@ ORTE_DECLSPEC int orte_util_convert_jobid_to_string(char **jobid_string, const o ORTE_DECLSPEC int orte_util_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring); ORTE_DECLSPEC int orte_util_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid); ORTE_DECLSPEC int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring); -#if ORTE_ENABLE_EPOCH -ORTE_DECLSPEC int orte_util_convert_epoch_to_string(char **epoch_string, const orte_epoch_t epoch); -ORTE_DECLSPEC int orte_util_convert_string_to_epoch(orte_vpid_t *epoch, const char* epochstring); -#endif ORTE_DECLSPEC int orte_util_convert_string_to_process_name(orte_process_name_t *name, const char* name_string); ORTE_DECLSPEC int orte_util_convert_process_name_to_string(char** name_string, const orte_process_name_t *name); -#if ORTE_ENABLE_EPOCH -ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, - orte_jobid_t job, - orte_vpid_t vpid, - orte_epoch_t epoch); -#else ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name, orte_jobid_t job, orte_vpid_t vpid); -#endif + ORTE_DECLSPEC int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields, const orte_process_name_t* name1, const orte_process_name_t* name2); diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index cbffb95346..93faa1826e 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -268,7 +268,6 @@ int orte_util_build_daemon_nidmap(char **nodes) */ /* construct the URI */ proc.vpid = node->daemon; - 
ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN); orte_util_convert_process_name_to_string(&proc_name, &proc); asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port); @@ -402,7 +401,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) int rc; uint8_t *oversub; - OPAL_OUTPUT_VERBOSE((2, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s decode:nidmap decoding nodemap", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -432,7 +431,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) return rc; } - OPAL_OUTPUT_VERBOSE((2, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s decode:nidmap decoding %d nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes)); @@ -510,7 +509,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) if (NULL == (nd = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { continue; } - opal_output(0, "%s node[%d].name %s daemon %s", + opal_output(5, "%s node[%d].name %s daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, (NULL == nd->name) ? "NULL" : nd->name, ORTE_VPID_PRINT(nd->daemon)); @@ -947,7 +946,7 @@ orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc) { orte_pmap_t *pmap; - OPAL_OUTPUT_VERBOSE((5, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_debug_output, "%s lookup:nid: looking for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); @@ -1018,208 +1017,3 @@ void orte_jobmap_dump(void) } opal_output(orte_clean_output, "\n\n"); } - -#if 0 -/* Useful for debugging. Not used otherwise. 
*/ -void print_orte_job_data() { - orte_job_t *jdata; - orte_proc_t *pdata; - int i, j; - - if (NULL == orte_job_data) { - opal_output(0, "ORTE_JOB_DATA == NULL"); - return; - } - - for (i = 0; i < orte_job_data->size; i++) { - if (NULL == (jdata = (orte_job_t *) opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } - opal_output(0, "JOB: %s", ORTE_JOBID_PRINT(jdata->jobid)); - - for (j = 0; j < jdata->num_procs; j++) { - if (NULL == (pdata = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - opal_output(0, " PROC: %s", ORTE_NAME_PRINT(&(pdata->name))); - } - } -} -#endif - -#if ORTE_ENABLE_EPOCH -/* Look up the current epoch value that we have stored locally. - * - * Note that this will not ping the HNP to get the most up to date epoch stored - * there, but it assumes that when it needs to know that the epoch has changed, - * someone will tell it. If you need the most up to date epoch, you should - * tell ask the hnp to refresh our information. - */ -orte_epoch_t orte_util_lookup_epoch(orte_process_name_t *proc) -{ - return get_epoch_from_orte_job_data(proc, ORTE_EPOCH_INVALID); -} - -/* Set the current epoch value that we have stored locally. - * - * This will update the currently stored local value for the epoch. 
- */ -orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, orte_epoch_t epoch) -{ - orte_epoch_t e = get_epoch_from_orte_job_data(proc, epoch); - /*print_orte_job_data();*/ - return e; -} -#endif - -#if ORTE_RESIL_ORTE -bool orte_util_proc_is_running(orte_process_name_t *proc) { - int i; - unsigned int j; - orte_job_t *jdata; - orte_proc_t *pdata; - - if (NULL == orte_job_data) { - return false; - } - - for (i = 0; i < orte_job_data->size; i++) { - if (NULL == (jdata = (orte_job_t *) opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } else if (proc->jobid == jdata->jobid) { - for (j = 0; j < jdata->num_procs; j++) { - if (NULL == (pdata = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } else if (proc->vpid == pdata->name.vpid) { - return ORTE_PROC_STATE_TERMINATED > pdata->state; - } - } - } - } - - return true; -} - -int orte_util_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state) { - int i; - unsigned int j; - orte_job_t *jdata; - orte_proc_t *pdata; - - if (NULL == orte_job_data) { - return ORTE_ERROR; - } - - for (i = 0; i < orte_job_data->size; i++) { - if (NULL == (jdata = (orte_job_t *) opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } else if (proc->jobid == jdata->jobid) { - for (j = 0; j < jdata->num_procs; j++) { - if (NULL == (pdata = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } else if (proc->vpid == pdata->name.vpid) { - pdata->state = state; - return ORTE_SUCCESS; - } - } - } - } - - return ORTE_ERROR; -} -#endif - -#if ORTE_ENABLE_EPOCH -/* - * This function performs both the get and set operations on the epoch for a - * sepcific process name. If the epoch passed into the function is - * ORTE_EPOCH_INVALID, then we are performing a get operation. If the epoch is - * anything else, we are performing a set operation. 
- */ -orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, orte_epoch_t epoch) { - int ret, i; - unsigned int j; - orte_job_t *jdata; - orte_proc_t *pdata; - - if (ORTE_JOBID_INVALID == proc->jobid || - ORTE_VPID_INVALID == proc->vpid) { - return ORTE_EPOCH_INVALID; - } - - /* Sanity check just to make sure we don't overwrite our existing - * orte_job_data. - */ - if (NULL == orte_job_data) { - orte_job_data = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, - 1, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - 1))) { - ORTE_ERROR_LOG(ret); - return ORTE_EPOCH_INVALID; - } - } - - /* Look to see if the job is in the orte_job_data. */ - for (i = 0; i < orte_job_data->size; i++) { - if (NULL == (jdata = (orte_job_t *) opal_pointer_array_get_item(orte_job_data, i))) { - continue; - } else if (proc->jobid == jdata->jobid) { - /* Found the right job, now look for the process. */ - for (j = 0; j < jdata->num_procs; j++) { - if (NULL == (pdata = (orte_proc_t *) opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } else if (proc->vpid == pdata->name.vpid) { - if (ORTE_EPOCH_INVALID != epoch) { - pdata->name.epoch = epoch; - } - return pdata->name.epoch; - } - } - - /* Found the right job but didn't find the process in it. Create the - * process if necessary. - */ - if (ORTE_EPOCH_INVALID != epoch) { - pdata = OBJ_NEW(orte_proc_t); - pdata->name.jobid = proc->jobid; - pdata->name.vpid = proc->vpid; - pdata->name.epoch = epoch; - - pdata->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_add(jdata->procs, pdata); - jdata->num_procs++; - - return pdata->name.epoch; - } else { - return ORTE_EPOCH_MIN; - } - } - } - - /* Didn't find the right job, add a new job structure and a new process. 
*/ - if (ORTE_EPOCH_INVALID != epoch) { - jdata = OBJ_NEW(orte_job_t); - jdata->jobid = proc->jobid; - - pdata = OBJ_NEW(orte_proc_t); - pdata->name.jobid = proc->jobid; - pdata->name.vpid = proc->vpid; - pdata->name.epoch = epoch; - - pdata->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_add(jdata->procs, pdata); - jdata->num_procs++; - - opal_pointer_array_add(orte_job_data, jdata); - - return pdata->name.epoch; - } else { - return ORTE_EPOCH_MIN; - } -} -#endif - diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index abc5ec695f..884b491984 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -48,20 +48,8 @@ ORTE_DECLSPEC orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job); ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc); ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc); -#if ORTE_ENABLE_EPOCH -ORTE_DECLSPEC orte_epoch_t orte_util_lookup_epoch(orte_process_name_t *proc); -ORTE_DECLSPEC orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, orte_epoch_t epoch); -#endif - ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state); -#if ORTE_RESIL_ORTE -#define PROC_IS_RUNNING(n) orte_util_proc_is_running(n) -ORTE_DECLSPEC bool orte_util_proc_is_running(orte_process_name_t *proc); -#else -#define PROC_IS_RUNNING(n) ( true ) -#endif - ORTE_DECLSPEC int orte_util_encode_nodemap(opal_byte_object_t *boptr); ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr); @@ -76,9 +64,4 @@ ORTE_DECLSPEC void orte_jobmap_dump(void); END_C_DECLS -/* Local functions */ -#if ORTE_ENABLE_EPOCH -orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, orte_epoch_t epoch); -#endif - #endif diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index ab7c25316a..f69b8cd4bd 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -36,11 +36,7 @@ #include "orte/util/proc_info.h" -#if ORTE_ENABLE_EPOCH -#define ORTE_NAME_INVALID 
{ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_MIN} -#else #define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID} -#endif ORTE_DECLSPEC orte_proc_info_t orte_process_info = { /* .my_name = */ ORTE_NAME_INVALID, @@ -73,10 +69,10 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = { /* .bind_level = */ OPAL_HWLOC_NODE_LEVEL, /* .bind_idx = */ 0, #endif - /* .job_name = */ NULL, - /* .job_instance = */ NULL, - /* .executable = */ NULL, - /* .app_rank = */ -1 + /* .app_rank = */ -1, + /* .peer_modex = */ -1, + /* .peer_init_barrier = */ -1, + /* .peer_fini_barrier = */ -1 }; static bool init=false; @@ -157,18 +153,6 @@ int orte_proc_info(void) true, false, 0, &tmp); orte_process_info.num_restarts = tmp; - mca_base_param_reg_string_name("orte", "job_name", - "Job name", - true, false, NULL, &orte_process_info.job_name); - - mca_base_param_reg_string_name("orte", "job_instance", - "Job instance", - true, false, NULL, &orte_process_info.job_instance); - - mca_base_param_reg_string_name("orte", "executable", - "Executable", - true, false, NULL, &orte_process_info.executable); - mca_base_param_reg_int_name("orte", "app_rank", "Rank of this proc within its app_context", true, false, 0, &tmp); @@ -184,6 +168,19 @@ int orte_proc_info(void) /* setup the sync buffer */ orte_process_info.sync_buf = OBJ_NEW(opal_buffer_t); + /* get the collective id info */ + mca_base_param_reg_int_name("orte", "peer_modex_id", "Peer modex collective id", + true, false, -1, &tmp); + orte_process_info.peer_modex = (orte_grpcomm_coll_id_t)tmp; + + mca_base_param_reg_int_name("orte", "peer_init_barrier_id", "Peer init barrier collective id", + true, false, -1, &tmp); + orte_process_info.peer_init_barrier = (orte_grpcomm_coll_id_t)tmp; + + mca_base_param_reg_int_name("orte", "peer_fini_barrier_id", "Peer finalize barrier collective id", + true, false, -1, &tmp); + orte_process_info.peer_fini_barrier = (orte_grpcomm_coll_id_t)tmp; + return ORTE_SUCCESS; } diff --git 
a/orte/util/proc_info.h b/orte/util/proc_info.h index 7743791770..8e3b76e79c 100644 --- a/orte/util/proc_info.h +++ b/orte/util/proc_info.h @@ -9,6 +9,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +43,8 @@ #include "opal/dss/dss_types.h" #include "opal/mca/hwloc/hwloc.h" +#include "orte/mca/grpcomm/grpcomm_types.h" + BEGIN_C_DECLS #define ORTE_MAX_HOSTNAME_SIZE 512 @@ -117,11 +121,10 @@ struct orte_proc_info_t { opal_hwloc_level_t bind_level; unsigned int bind_idx; #endif - /* name/instance info for debug support */ - char *job_name; - char *job_instance; - char *executable; int32_t app_rank; + orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */ + orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */ + orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */ }; typedef struct orte_proc_info_t orte_proc_info_t; diff --git a/orte/util/show_help.c b/orte/util/show_help.c index 094a62a9da..378fdd9d00 100644 --- a/orte/util/show_help.c +++ b/orte/util/show_help.c @@ -482,7 +482,7 @@ static int show_help(const char *filename, const char *topic, if (now > show_help_time_last_displayed + 5 && !show_help_timer_set) { show_accumulated_duplicates(0, 0, NULL); } else if (!show_help_timer_set) { - opal_event_evtimer_set(opal_event_base, &show_help_timer_event, + opal_event_evtimer_set(orte_event_base, &show_help_timer_event, show_accumulated_duplicates, NULL); opal_event_evtimer_add(&show_help_timer_event, &show_help_interval); show_help_timer_set = true; @@ -519,7 +519,7 @@ static int show_help(const char *filename, const char *topic, return rc; } pnli->name = *sender; - opal_list_append(&(tli->tli_processes), &(pnli->item)); + opal_list_append(&(tli->tli_processes), &(pnli->super)); 
} return ORTE_SUCCESS; } @@ -583,13 +583,7 @@ cleanup: if (NULL != topic) { free(topic); } - /* reissue the recv */ - rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, - ORTE_RML_NON_PERSISTENT, orte_show_help_recv, NULL); - if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { - ORTE_ERROR_LOG(rc); - } -} + } int orte_show_help_init(void) { diff --git a/test/util/Makefile.am b/test/util/Makefile.am index 75cee624c6..5aebd9d894 100644 --- a/test/util/Makefile.am +++ b/test/util/Makefile.am @@ -9,6 +9,8 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2012 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -32,8 +34,7 @@ AM_CPPFLAGS = -I$(top_srcdir)/test/support check_PROGRAMS = \ opal_bit_ops \ - opal_path_nfs \ - opal_sos + opal_path_nfs TESTS = \ $(check_PROGRAMS) @@ -80,12 +81,6 @@ opal_path_nfs_LDADD = \ $(top_builddir)/test/support/libsupport.a opal_path_nfs_DEPENDENCIES = $(opal_path_nfs_LDADD) -opal_sos_SOURCES = opal_sos.c -opal_sos_LDADD = \ - $(top_builddir)/opal/libopen-pal.la \ - $(top_builddir)/test/support/libsupport.a -opal_sos_DEPENDENCIES = $(opal_error_LDADD) - #opal_os_path_SOURCES = opal_os_path.c #opal_os_path_LDADD = \ # $(top_builddir)/opal/libopen-pal.la \ diff --git a/test/util/opal_sos.c b/test/util/opal_sos.c deleted file mode 100644 index 85f0f92559..0000000000 --- a/test/util/opal_sos.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include -#ifdef HAVE_SYS_PARAM_H -#include -#endif -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif -#ifdef HAVE_NETDB_H -#include -#endif -#include - -#include "support.h" -#include "opal/constants.h" -#include "opal/runtime/opal.h" -#include "opal/util/opal_sos.h" -#include "opal/util/show_help.h" -#include "opal/util/output.h" -#include "orte/runtime/runtime.h" -#include "orte/constants.h" - -static bool opal_sos_test(void); - -int -main(int argc, char *argv[]) -{ - opal_init_util(&argc, &argv); - test_init("opal_sos test suite"); - opal_sos_test(); - opal_finalize(); - test_finalize(); - return 0; -} - -/** OPAL_SOS_* macro test */ -static bool opal_sos_test(void) -{ - int errnum1 = 0, errnum2 = 0; - char *err_str; - - /* Checking for the correctness of GET_ and SET_ error code - * operations */ - errnum1 = OPAL_SOS_GET_ERROR_CODE(OPAL_ERR_OUT_OF_RESOURCE); - test_verify("failed", OPAL_ERR_OUT_OF_RESOURCE == errnum1); - - OPAL_SOS_SET_ERROR_CODE(errnum1, OPAL_ERR_IN_ERRNO); - test_verify("failed", OPAL_ERR_IN_ERRNO == - OPAL_SOS_GET_ERROR_CODE(errnum1)); - - /* Check if OPAL_ERR_OUT_OF_RESOURCE is a native error code or - * not. Since OPAL_ERR_OUT_OF_RESOURCE is native, this should - * return true. */ - test_verify("failed", true == - OPAL_SOS_IS_NATIVE(OPAL_ERR_OUT_OF_RESOURCE)); - - test_verify("failed", true == OPAL_SOS_IS_NATIVE(errnum1)); - - /* Encode a native error (OPAL_ERR_OUT_OF_RESOURCE) by - * logging it in the SOS framework using one of the SOS - * reporter macros. 
This returns an encoded error code - * (errnum1) with information about the native error such - * as the severity, the native error code, the attached - * error index etc. */ - errnum1 = OPAL_SOS_INFO((OPAL_ERR_OUT_OF_RESOURCE, false, - "Error %d: out of resource", - OPAL_ERR_OUT_OF_RESOURCE)); - - /* Check if errnum1 is native or not. This should return false */ - test_verify("failed", false == OPAL_SOS_IS_NATIVE(errnum1)); - test_verify("failed", - OPAL_SOS_SEVERITY_INFO == OPAL_SOS_GET_SEVERITY(errnum1)); - - /* Extract the native error code out of errnum1. This should - * return the encoded native error code associated with errnum1 - * (i.e. OPAL_ERR_OUT_OF_RESOURCE). */ - test_verify("failed", OPAL_ERR_OUT_OF_RESOURCE == - OPAL_SOS_GET_ERROR_CODE(errnum1)); - - /* We log another error event as a child of the previous error - * errnum1. In the process, we decide to raise the severity - * level from INFO to WARN. */ - err_str = opal_output_string(0, 0, "my error string -100"); - errnum1 = OPAL_SOS_WARN((errnum1, false, err_str)); - test_verify("failed", - OPAL_SOS_SEVERITY_WARN == OPAL_SOS_GET_SEVERITY(errnum1)); - - test_verify("failed", OPAL_ERR_OUT_OF_RESOURCE == - OPAL_SOS_GET_ERROR_CODE(errnum1)); - free(err_str); - - /* Let's report another event with severity ERROR using - * OPAL_SOS_ERROR() and in effect promote errnum1 to - * severity 'ERROR'. */ - err_str = opal_show_help_string("help-opal-util.txt", - "stacktrace signal override", - false, 10, 10, 10, "15"); - /* If OMPI isn't installed yet (which is quite likely/possible if - we're running "make check"!), then the show_help_string will - return NULL. So just put in any old string. */ - if (NULL == err_str) { - err_str = strdup("Open MPI does not appear to be installed, so we'll just substitue in a random message to show during the opal_sos test. 
You can ignore the above warning about not finding the help message about 'stacktrace signal override'."); - } - errnum1 = OPAL_SOS_ERROR((errnum1, false, err_str)); - test_verify("failed", - OPAL_SOS_SEVERITY_ERROR == OPAL_SOS_GET_SEVERITY(errnum1)); - free(err_str); - - /* Check the native code associated with the previously encoded - * error. This should still return (OPAL_ERR_OUT_OF_RESOURCE) - * since the entire error history originates from the native - * error OPAL_ERR_OUT_OF_RESOURCE */ - test_verify("failed", OPAL_ERR_OUT_OF_RESOURCE == - OPAL_SOS_GET_ERROR_CODE(errnum1)); - - /* We start off another error history stack originating with a - * native error, ORTE_ERR_FATAL. */ - asprintf(&err_str, "Fatal error occurred in ORTE %d", errnum1); - errnum2 = OPAL_SOS_ERROR((ORTE_ERR_FATAL, true, err_str)); - free(err_str); - test_verify("failed", - OPAL_SOS_SEVERITY_ERROR == OPAL_SOS_GET_SEVERITY(errnum2)); - test_verify("failed", OPAL_ERR_FATAL == - OPAL_SOS_GET_ERROR_CODE(errnum2)); - - /* Registering another error with severity ERROR. - * There is no change in the severity */ - errnum2 = OPAL_SOS_WARN((errnum2, false, "this process must die.")); - test_verify("failed", - OPAL_SOS_SEVERITY_WARN == OPAL_SOS_GET_SEVERITY(errnum2)); - test_verify("failed", OPAL_ERR_FATAL == - OPAL_SOS_GET_ERROR_CODE(errnum2)); - - /* We attach the two error traces originating from errnum1 - * and errnum2. The "attached error index" in errnum1 is - * set to errnum2 to indicate that the two error stacks - * are forked down from this point on. 
*/ - OPAL_SOS_ATTACH(errnum1, errnum2); - - /* Print out the entire error event history originating from errnum1 */ -#if 0 - printf("<------ BEGIN output of OPAL SOS error message ------->\n"); - OPAL_SOS_PRINT(errnum1, true); - printf("<------ END output of OPAL SOS error message ------->\n"); -#endif - test_success(); - - /* Cleanup */ - OPAL_SOS_FREE(&errnum1); - OPAL_SOS_FREE(&errnum2); - - return true; -} diff --git a/test/util/orte_session_dir.c b/test/util/orte_session_dir.c index 3fe6d22686..a089ef38c6 100644 --- a/test/util/orte_session_dir.c +++ b/test/util/orte_session_dir.c @@ -57,7 +57,6 @@ int main(int argc, char* argv[]) orte_process_info.my_name->cellid = 0; orte_process_info.my_name->jobid = 0; orte_process_info.my_name->vpid = 0; - ORTE_EPOCH_SET(orte_process_info.my_name->epoch,ORTE_EPOCH_MIN); test_init("orte_session_dir_t"); test_out = fopen( "test_session_dir_out", "w+" );