From a753c3ece0ba698800d274ee8465a2d8c520a959 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Fri, 21 Nov 2014 12:26:12 -0700 Subject: [PATCH 1/7] ess/alps: add initial alps ess component Note this alps ess component has nothing to do with the old CNOS alps component used on Cray Seastar/Portals3 (Cray XT) systems. To work properly, changes need to be made to the open method of the ess/pmi component to keep it from selecting, and thus initializing, the opal/pmix/cray component. --- orte/mca/ess/alps/Makefile.am | 50 +++++ orte/mca/ess/alps/configure.m4 | 52 ++++++ orte/mca/ess/alps/ess_alps.h | 56 ++++++ orte/mca/ess/alps/ess_alps_component.c | 123 +++++++++++++ orte/mca/ess/alps/ess_alps_module.c | 229 +++++++++++++++++++++++ orte/mca/ess/alps/ess_alps_utils.c | 241 +++++++++++++++++++++++++ 6 files changed, 751 insertions(+) create mode 100644 orte/mca/ess/alps/Makefile.am create mode 100644 orte/mca/ess/alps/configure.m4 create mode 100644 orte/mca/ess/alps/ess_alps.h create mode 100644 orte/mca/ess/alps/ess_alps_component.c create mode 100644 orte/mca/ess/alps/ess_alps_module.c create mode 100644 orte/mca/ess/alps/ess_alps_utils.c diff --git a/orte/mca/ess/alps/Makefile.am b/orte/mca/ess/alps/Makefile.am new file mode 100644 index 0000000000..e647127d8a --- /dev/null +++ b/orte/mca/ess/alps/Makefile.am @@ -0,0 +1,50 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + ess_alps.h \ + ess_alps_component.c \ + ess_alps_module.c \ + ess_alps_utils.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_ess_alps_DSO +component_noinst = +component_install = mca_ess_alps.la +else +component_noinst = libmca_ess_alps.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_ess_alps_la_SOURCES = $(sources) +mca_ess_alps_la_CPPFLAGS = $(ess_alps_CPPFLAGS) -fno-ident +mca_ess_alps_la_LDFLAGS = -module -avoid-version $(ess_alps_LDFLAGS) +mca_ess_alps_la_LIBADD = $(ess_alps_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_ess_alps_la_SOURCES =$(sources) +libmca_ess_alps_la_CPPFLAGS = $(ess_alps_CPPFLAGS) +libmca_ess_alps_la_LDFLAGS = -module -avoid-version $(ess_alps_LDFLAGS) +libmca_ess_alps_la_LIBADD = $(ess_alps_LIBS) + diff --git a/orte/mca/ess/alps/configure.m4 b/orte/mca/ess/alps/configure.m4 new file mode 100644 index 0000000000..ee6ecb35d8 --- /dev/null +++ b/orte/mca/ess/alps/configure.m4 @@ -0,0 +1,52 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011 Los Alamos National Security, LLC. +# All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_orte_ess_alps_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_orte_ess_alps_CONFIG],[ + AC_CONFIG_FILES([orte/mca/ess/alps/Makefile]) + + ess_alps_lli_happy="no" + ess_alps_util_happy="no" + + PKG_CHECK_MODULES([CRAY_ALPS_LLI], [cray-alpslli], + [ess_alps_CPPFLAGS=$CRAY_ALPS_LLI_CFLAGS + ess_alps_LDFLAGS=$CRAY_ALPS_LLI_LIBS + ess_alps_LIBS=$CRAY_ALPS_LLI_LIBS + ess_alps_lli_happy="yes"], + [AC_MSG_RESULT([no])]) + + PKG_CHECK_MODULES([CRAY_ALPS_UTIL], [cray-alpsutil], + [ess_alps_CPPFLAGS="$ess_alps_CPPFLAGS $CRAY_ALPS_UTIL_CFLAGS" + ess_alps_LDFLAGS="$ess_alps_LDFLAGS $CRAY_ALPS_UTIL_LIBS" + ess_alps_LIBS="$ess_alps_LIBS $CRAY_ALPS_UTIL_LIBS" + ess_alps_util_happy="yes"], + [AC_MSG_RESULT([no])]) + + AS_IF([test "$ess_alps_lli_happy" = "yes" -a "$ess_alps_util_happy" = "yes"], + [$1 + AC_SUBST([ess_alps_CPPFLAGS]) + AC_SUBST([ess_alps_LDFLAGS]) + AC_SUBST([ess_alps_LIBS])], + [$2]) + +])dnl diff --git a/orte/mca/ess/alps/ess_alps.h b/orte/mca/ess/alps/ess_alps.h new file mode 100644 index 0000000000..70c4098c6a --- /dev/null +++ b/orte/mca/ess/alps/ess_alps.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef ORTE_ESS_ALPS_H +#define ORTE_ESS_ALPS_H + +#include "orte_config.h" +#include "opal/mca/mca.h" +#include "orte/mca/ess/ess.h" + +#include "alps/alps.h" +#include "alps/alps_toolAssist.h" +#include "alps/libalpsutil.h" +#include "alps/libalpslli.h" + +BEGIN_C_DECLS + +/* + * Component open / close / query + */ +int orte_ess_alps_component_open(void); +int orte_ess_alps_component_close(void); +int orte_ess_alps_component_query(mca_base_module_t **module, int *priority); + +/* + * alps component internal utility functions + */ + +int orte_ess_alps_get_first_rank_on_node(int *first_rank); +int orte_ess_alps_sync_start(void); +int orte_ess_alps_sync_complete(void); + +/* + * ESS Alps module and component + */ +extern orte_ess_base_module_t orte_ess_alps_module; +ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_alps_component; + +END_C_DECLS + +#endif /* ORTE_ESS_ALPS_H */ diff --git a/orte/mca/ess/alps/ess_alps_component.c b/orte/mca/ess/alps/ess_alps_component.c new file mode 100644 index 0000000000..5f2f8c3a30 --- /dev/null +++ b/orte/mca/ess/alps/ess_alps_component.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. 
Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. + */ + +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/runtime/orte_globals.h" +#include "orte/util/proc_info.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/ess/base/base.h" +#include "orte/mca/ess/alps/ess_alps.h" + +#include + + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ +orte_ess_base_component_t mca_ess_alps_component = { + /* First, the mca_component_t struct containing meta information + about the component itself */ + { + ORTE_ESS_BASE_VERSION_3_0_0, + + /* Component name and version */ + "alps", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + orte_ess_alps_component_open, + orte_ess_alps_component_close, + orte_ess_alps_component_query + }, + { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + } +}; + +int +orte_ess_alps_component_open(void) +{ + return ORTE_SUCCESS; +} + +int orte_ess_alps_component_query(mca_base_module_t **module, int *priority) +{ + int rc = ORTE_SUCCESS; + const char proc_job_file[]="/proc/job"; + FILE *fd = NULL, *fd_task_is_app = NULL; + char task_is_app_fname[PATH_MAX]; + + /* + * don't use the alps ess component if an app proc + */ + + if (ORTE_PROC_IS_APP) { + *priority = 0; + *module = NULL; + return ORTE_ERROR; + } + + /* + * make sure we're in a Cray PAGG container, and that we are also on + * a compute node (i.e. 
we are thought of as an application task by + * the cray job kernel module - the thing that creates the PAGG) + */ + + /* disqualify ourselves if not running in a Cray PAGG container */ + fd = fopen(proc_job_file, "r"); + if (fd == NULL) { + *priority = 0; + *module = NULL; + rc = ORTE_ERROR; + } else { + snprintf(task_is_app_fname,sizeof(task_is_app_fname), + "/proc/self/task/%ld/task_is_app",syscall(SYS_gettid)); + fd_task_is_app = fopen(task_is_app_fname, "r"); + if (fd_task_is_app != NULL) { /* okay we're in a PAGG container, + and we are an app task (not just a process + running on a mom node, for example), + so we should give cray pmi a shot. */ + *priority = 35; /* take precendence over base */ + *module = (mca_base_module_t *) &orte_ess_alps_module; + fclose(fd_task_is_app); + } + fclose(fd); + } + + return rc; +} + +int +orte_ess_alps_component_close(void) +{ + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c new file mode 100644 index 0000000000..b557b155ad --- /dev/null +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include "orte/util/show_help.h" +#include "opal/util/argv.h" + +#include "orte/util/proc_info.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" +#include "orte/util/regex.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/ess/ess.h" +#include "orte/mca/ess/base/base.h" +#include "orte/mca/ess/alps/ess_alps.h" + +#include + +static int alps_set_name(void); +static int rte_init(void); +static int rte_finalize(void); + +orte_ess_base_module_t orte_ess_alps_module = { + rte_init, + rte_finalize, + orte_ess_base_app_abort, + NULL /* ft_event */ +}; + +/* Local variables */ +static orte_vpid_t starting_vpid = 0; + + +static int rte_init(void) +{ + int ret, i; + char *error = NULL; + char **hosts = NULL; + + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, + "ess:alps in rte_init")); + + /* + * shouldn't have been able to open this ess component if + * process is app proc + */ + + if (ORTE_PROC_IS_APP) { + error = "mpi rank invoking alps rte_init"; + ret = ORTE_ERR_NOT_SUPPORTED; + goto fn_fail; + } + + /* run the prolog */ + if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { + error = "orte_ess_base_std_prolog"; + goto fn_fail; + } + + if (ORTE_SUCCESS != (ret = alps_set_name())) { + error = "alps_set_name"; + goto fn_fail; + } + + /* + * if I am a daemon, complete my setup using the + * default procedure + */ + if (ORTE_PROC_IS_DAEMON) { + if (NULL != orte_node_regex) { + /* extract the nodes */ + if (ORTE_SUCCESS != (ret = + orte_regex_extract_node_names(orte_node_regex, &hosts)) || + NULL == hosts) { + error = "orte_regex_extract_node_names"; + goto fn_fail; + } + + /* find our host in the list */ + for (i=0; NULL != hosts[i]; i++) { + if (0 == strncmp(hosts[i], orte_process_info.nodename, + strlen(hosts[i]))) { + /* correct our vpid 
- this is probably not necessary with aprun*/ + ORTE_PROC_MY_NAME->vpid = starting_vpid + i; + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, + "ess:alps reset name to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + break; + } + } + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_orted_setup"; + goto fn_fail; + } + if (NULL != hosts) { + opal_argv_free(hosts); + } + + /* + * now synchronize with aprun. + */ + + if (ORTE_SUCCESS != (ret = orte_ess_alps_sync_start())) { + error = "orte_ess_alps_sync"; + goto fn_fail; + } + + ret = ORTE_SUCCESS; + goto fn_exit; + } + + if (ORTE_PROC_IS_TOOL) { + /* otherwise, if I am a tool proc, use that procedure */ + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { + ORTE_ERROR_LOG(ret); + error = "orte_ess_base_tool_setup"; + goto fn_fail; + } + /* as a tool, I don't need a nidmap - so just return now */ + ret = ORTE_SUCCESS; + goto fn_exit; + } + + fn_exit: + return ret; + + fn_fail: + if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { + orte_show_help("help-orte-runtime.txt", + "orte_init:startup:internal-failure", + true, error, ORTE_ERROR_NAME(ret), ret); + } + goto fn_exit; +} + +static int rte_finalize(void) +{ + int ret = ORTE_SUCCESS; + + /* if I am a daemon, finalize using the default procedure */ + if (ORTE_PROC_IS_DAEMON) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { + ORTE_ERROR_LOG(ret); + goto fn_exit; + } + + /* notify alps that we're done */ + if (ORTE_SUCCESS != (ret = orte_ess_alps_sync_complete())) { + ORTE_ERROR_LOG(ret); + } + + } else if (ORTE_PROC_IS_TOOL) { + /* otherwise, if I am a tool proc, use that procedure */ + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_finalize())) { + ORTE_ERROR_LOG(ret); + } + } + + fn_exit: + return ret; +} + +static int alps_set_name(void) +{ + int rc; + int rank; + orte_jobid_t jobid; + + if (NULL == orte_ess_base_jobid) { + 
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, orte_ess_base_jobid))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (NULL == orte_ess_base_vpid) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid, + orte_ess_base_vpid))) { + ORTE_ERROR_LOG(rc); + return(rc); + } + + ORTE_PROC_MY_NAME->jobid = jobid; + + if (ORTE_SUCCESS != (rc = orte_ess_alps_get_first_rank_on_node(&rank))) { + ORTE_ERROR_LOG(rc); + return(rc); + } + + ORTE_PROC_MY_NAME->vpid = (orte_vpid_t)rank + starting_vpid; + + /* get the num procs as provided in the cmd line param */ + if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/ess/alps/ess_alps_utils.c b/orte/mca/ess/alps/ess_alps_utils.c new file mode 100644 index 0000000000..cc7054638b --- /dev/null +++ b/orte/mca/ess/alps/ess_alps_utils.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include "orte/util/show_help.h" +#include "opal/util/argv.h" + +#include "orte/util/proc_info.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" +#include "orte/util/regex.h" +#include "orte/runtime/orte_globals.h" + +#include "orte/mca/ess/ess.h" +#include "orte/mca/ess/base/base.h" +#include "orte/mca/ess/alps/ess_alps.h" + +/* + * use the Alps placement file to obtain + * the global rank of the "first" local rank + * on the node. + */ + +int +orte_ess_alps_get_first_rank_on_node(int *first_rank) +{ + int alps_status = 0; + uint64_t apid; + size_t alps_count; + int ret = ORTE_SUCCESS; + int lli_ret = 0, place_ret; + alpsAppLayout_t orted_layout; + + if (first_rank == NULL) { + ret = ORTE_ERR_BAD_PARAM; + goto fn_exit; + } + + /* + * First get our apid + */ + + lli_ret = alps_app_lli_lock(); + if (0 != lli_ret) { /* lock not acquired, so bypass the unlock at fn_exit_w_lock */ + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_lock returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + goto fn_exit; + } + + lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_put_request - APID returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + goto fn_exit_w_lock; + } + + lli_ret = alps_app_lli_get_response (&alps_status, &alps_count); + if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_get_response returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status)); + ret = ORTE_ERR_FILE_READ_FAILURE; + goto fn_exit_w_lock; + } + + lli_ret = 
alps_app_lli_get_response_bytes (&apid, sizeof(apid)); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_get_response_bytes returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_READ_FAILURE; + goto fn_exit_w_lock; + } + + place_ret = alps_get_placement_info(apid, + &orted_layout, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL); + if (1 != place_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_get_placement_info returned %d (%s)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, strerror(errno))); + ret = ORTE_ERROR; + goto fn_exit_w_lock; /* the lli lock is still held here, so release it */ + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_get_placement_info returned %d first pe on node is %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, orted_layout.firstPe)); + *first_rank = orted_layout.firstPe; + + fn_exit_w_lock: + lli_ret = alps_app_lli_unlock(); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_unlock returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + } + + fn_exit: + return ret; +} + +/* + * Function to check in with apshepherd to say we are a parallel application + */ +int +orte_ess_alps_sync_start(void) +{ + int ret = ORTE_SUCCESS; + int lli_ret = 0; + int alps_status = 0; + size_t alps_count; + + lli_ret = alps_app_lli_lock(); + if (0 != lli_ret) { /* lock not acquired, so bypass the unlock at fn_exit_w_lock */ + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_lock returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + goto fn_exit; + } + + lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_START, NULL, 0); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, 
orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_put_request returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + goto fn_exit_w_lock; + } + + lli_ret = alps_app_lli_get_response (&alps_status, &alps_count); + if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_get_response returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status)); + ret = ORTE_ERR_FILE_READ_FAILURE; + goto fn_exit_w_lock; + } + + fn_exit_w_lock: + lli_ret = alps_app_lli_unlock(); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_unlock returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + } + + fn_exit: + return ret; +} + +/* + * Function to notify apshepherd that this process is exiting + */ + +int +orte_ess_alps_sync_complete(void) +{ + int ret = ORTE_SUCCESS; + int lli_ret = 0; + int alps_status = 0; + size_t alps_count; + + lli_ret = alps_app_lli_lock(); + if (0 != lli_ret) { /* lock not acquired, so bypass the unlock at fn_exit_w_lock */ + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_lock returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + goto fn_exit; + } + + lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_EXITING, NULL, 0); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_put_request returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + goto fn_exit_w_lock; + } + + lli_ret = alps_app_lli_get_response (&alps_status, &alps_count); + if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: 
alps_app_lli_get_response returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status)); + ret = ORTE_ERR_FILE_READ_FAILURE; + goto fn_exit_w_lock; + } + + fn_exit_w_lock: + lli_ret = alps_app_lli_unlock(); + if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { + OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, + "%s ess:alps: alps_app_lli_unlock returned %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); + ret = ORTE_ERR_FILE_WRITE_FAILURE; + } + + fn_exit: + return ret; +} + + From e0487e7702ecca93687527e6b5a4c5f53d952c7f Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Sat, 22 Nov 2014 12:51:56 -0800 Subject: [PATCH 2/7] orte/common/alps: add an alps common lib to orte Add an alps common lib to orte. Add a function to determine whether or not a process is in a PAGG container. Note: we need a better naming convention for common libs, since right now they use a "flat" naming convention. --- VERSION | 3 + configure.ac | 1 + orte/mca/common/alps/Makefile.am | 66 ++++++++++++++++++++++ orte/mca/common/alps/common_alps.c | 72 ++++++++++++++++++++++++ orte/mca/common/alps/common_alps.h | 32 +++++++++++ orte/mca/common/alps/configure.m4 | 41 ++++++++++++++ orte/mca/ess/alps/Makefile.am | 3 +- orte/mca/ess/alps/ess_alps_component.c | 28 ++------- orte/mca/odls/alps/Makefile.am | 3 +- orte/mca/odls/alps/odls_alps_component.c | 29 ++-------- 10 files changed, 231 insertions(+), 47 deletions(-) create mode 100644 orte/mca/common/alps/Makefile.am create mode 100644 orte/mca/common/alps/common_alps.c create mode 100644 orte/mca/common/alps/common_alps.h create mode 100644 orte/mca/common/alps/configure.m4 diff --git a/VERSION b/VERSION index 7a245a4051..88ba107f5f 100644 --- a/VERSION +++ b/VERSION @@ -107,3 +107,6 @@ libmca_common_verbs_so_version=0:0:0 # OPAL layer libmca_opal_common_pmi_so_version=0:0:0 + +# ORTE layer +libmca_common_alps_so_version=0:0:0 diff --git a/configure.ac b/configure.ac index 8212fcbf42..87095309fa 100644 --- a/configure.ac 
+++ b/configure.ac @@ -157,6 +157,7 @@ AC_SUBST(libmca_common_ofacm_so_version) AC_SUBST(libmca_common_sm_so_version) AC_SUBST(libmca_common_ugni_so_version) AC_SUBST(libmca_common_verbs_so_version) +AC_SUBST(libmca_common_alps_so_version) # # Get the versions of the autotools that were used to bootstrap us diff --git a/orte/mca/common/alps/Makefile.am b/orte/mca/common/alps/Makefile.am new file mode 100644 index 0000000000..f7e8be173d --- /dev/null +++ b/orte/mca/common/alps/Makefile.am @@ -0,0 +1,66 @@ +# +# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. +# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. +# Copyright (c) 2012-2014 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(common_alps_CPPFLAGS) + +headers = \ + common_alps.h + +sources = \ + common_alps.c + + +lib_LTLIBRARIES = +noinst_LTLIBRARIES = +comp_inst = lib@ORTE_LIB_PREFIX@mca_common_alps.la +comp_noinst = lib@ORTE_LIB_PREFIX@mca_common_alps_noinst.la + +if MCA_BUILD_orte_common_alps_DSO +lib_LTLIBRARIES += $(comp_inst) +else +noinst_LTLIBRARIES += $(comp_noinst) +endif + +lib@ORTE_LIB_PREFIX@mca_common_alps_la_SOURCES = $(headers) $(sources) +lib@ORTE_LIB_PREFIX@mca_common_alps_la_CPPFLAGS = $(common_alps_CPPFLAGS) +lib@ORTE_LIB_PREFIX@mca_common_alps_la_LDFLAGS = \ + -version-info $(libmca_common_alps_so_version) \ + $(common_alps_LDFLAGS) +lib@ORTE_LIB_PREFIX@mca_common_alps_la_LIBADD = $(common_alps_LIBS) +lib@ORTE_LIB_PREFIX@mca_common_alps_noinst_la_SOURCES = $(headers) $(sources) + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +opaldir = $(opalincludedir)/opal/mca/common/common_alps.h +opal_HEADERS = $(headers) +else +opaldir = $(includedir) +endif + +# These two rules will sym link the "noinst" libtool library filename +# to the installable libtool library filename in the case where we are +# compiling this component statically (case 2), 
described above). +V=0 +OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) +ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) +ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`; + +all-local: + $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \ + fi + +clean-local: + if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + fi diff --git a/orte/mca/common/alps/common_alps.c b/orte/mca/common/alps/common_alps.c new file mode 100644 index 0000000000..3daad6f42a --- /dev/null +++ b/orte/mca/common/alps/common_alps.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. 
+ */ + +#include "opal/types.h" + +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/mca/common/alps/common_alps.h" + +#include +#include + + +/* + * determine whether or not calling process is in a Cray PAGG container + */ + +int orte_common_alps_proc_in_pagg(bool *flag) +{ + int rc = ORTE_SUCCESS; + const char proc_job_file[]="/proc/job"; + FILE *fd = NULL, *fd_task_is_app = NULL; + char task_is_app_fname[PATH_MAX]; + + if (flag == NULL) { + return ORTE_ERR_BAD_PARAM; + } + + fd = fopen(proc_job_file, "r"); + if (fd == NULL) { + *flag = 0; + } else { + snprintf(task_is_app_fname,sizeof(task_is_app_fname), + "/proc/self/task/%ld/task_is_app",syscall(SYS_gettid)); + fd_task_is_app = fopen(task_is_app_fname, "r"); + if (fd_task_is_app != NULL) { /* okay we're in a PAGG container, + and we are an app task (not just a process + running on a mom node, for example), */ + *flag = 1; + fclose(fd_task_is_app); + } else { + *flag = 0; + } + fclose(fd); + } + + return rc; +} + diff --git a/orte/mca/common/alps/common_alps.h b/orte/mca/common/alps/common_alps.h new file mode 100644 index 0000000000..21333f733d --- /dev/null +++ b/orte/mca/common/alps/common_alps.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * All rights reserved. + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2012-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef _COMMON_ALPS_H_ +#define _COMMON_ALPS_H_ + +#include "opal_config.h" + +BEGIN_C_DECLS + +/** + * Determine if calling process is in a Cray PAGG job container. + * flag set to TRUE if the process is in a PAGG, otherwise FALSE. 
+ */ +OPAL_DECLSPEC int orte_common_alps_proc_in_pagg(bool *flag); + +END_C_DECLS + +#endif + diff --git a/orte/mca/common/alps/configure.m4 b/orte/mca/common/alps/configure.m4 new file mode 100644 index 0000000000..e0ac065544 --- /dev/null +++ b/orte/mca/common/alps/configure.m4 @@ -0,0 +1,41 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. +# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_orte_common_alps_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_orte_common_alps_CONFIG],[ + AC_CONFIG_FILES([orte/mca/common/alps/Makefile]) + common_verbs_happy="no" + OPAL_CHECK_OPENFABRICS([common_verbs], + [common_verbs_happy="yes"]) + + AS_IF([test "$common_verbs_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build openib + AC_SUBST([common_alps_CFLAGS]) + AC_SUBST([common_alps_CPPFLAGS]) + AC_SUBST([common_alps_LDFLAGS]) + AC_SUBST([common_alps_LIBS]) +])dnl diff --git a/orte/mca/ess/alps/Makefile.am b/orte/mca/ess/alps/Makefile.am index e647127d8a..36d32fd900 100644 --- a/orte/mca/ess/alps/Makefile.am +++ b/orte/mca/ess/alps/Makefile.am @@ -40,7 +40,8 @@ mcacomponent_LTLIBRARIES = $(component_install) mca_ess_alps_la_SOURCES = $(sources) mca_ess_alps_la_CPPFLAGS = $(ess_alps_CPPFLAGS) -fno-ident mca_ess_alps_la_LDFLAGS = -module -avoid-version $(ess_alps_LDFLAGS) -mca_ess_alps_la_LIBADD = $(ess_alps_LDFLAGS) +mca_ess_alps_la_LIBADD = $(ess_alps_LDFLAGS) \ + $(ORTE_TOP_BUILDDIR)/orte/mca/common/alps/lib@ORTE_LIB_PREFIX@mca_common_alps.la noinst_LTLIBRARIES = $(component_noinst) libmca_ess_alps_la_SOURCES =$(sources) diff --git a/orte/mca/ess/alps/ess_alps_component.c b/orte/mca/ess/alps/ess_alps_component.c index 5f2f8c3a30..830f532776 100644 --- a/orte/mca/ess/alps/ess_alps_component.c +++ b/orte/mca/ess/alps/ess_alps_component.c @@ -28,6 +28,7 @@ #include "orte/constants.h" #include "orte/runtime/orte_globals.h" #include "orte/util/proc_info.h" +#include "orte/mca/common/alps/common_alps.h" #include "orte/mca/ess/ess.h" #include "orte/mca/ess/base/base.h" #include "orte/mca/ess/alps/ess_alps.h" @@ -71,9 +72,7 @@ orte_ess_alps_component_open(void) int orte_ess_alps_component_query(mca_base_module_t **module, int *priority) { int rc = ORTE_SUCCESS; - const char 
proc_job_file[]="/proc/job"; - FILE *fd = NULL, *fd_task_is_app = NULL; - char task_is_app_fname[PATH_MAX]; + bool flag; /* * don't use the alps ess component if an app proc @@ -91,25 +90,10 @@ int orte_ess_alps_component_query(mca_base_module_t **module, int *priority) * the cray job kernel module - the thing that creates the PAGG) */ - /* disqualify ourselves if not running in a Cray PAGG container */ - fd = fopen(proc_job_file, "r"); - if (fd == NULL) { - *priority = 0; - *module = NULL; - rc = ORTE_ERROR; - } else { - snprintf(task_is_app_fname,sizeof(task_is_app_fname), - "/proc/self/task/%ld/task_is_app",syscall(SYS_gettid)); - fd_task_is_app = fopen(task_is_app_fname, "r"); - if (fd_task_is_app != NULL) { /* okay we're in a PAGG container, - and we are an app task (not just a process - running on a mom node, for example), - so we should give cray pmi a shot. */ - *priority = 35; /* take precendence over base */ - *module = (mca_base_module_t *) &orte_ess_alps_module; - fclose(fd_task_is_app); - } - fclose(fd); + rc = orte_common_alps_proc_in_pagg(&flag); + if ((ORTE_SUCCESS == rc) && flag) { + *priority = 35; /* take precedence over base */ + *module = (mca_base_module_t *) &orte_ess_alps_module; } return rc; diff --git a/orte/mca/odls/alps/Makefile.am b/orte/mca/odls/alps/Makefile.am index 3c907144e6..145f5a384e 100644 --- a/orte/mca/odls/alps/Makefile.am +++ b/orte/mca/odls/alps/Makefile.am @@ -44,7 +44,8 @@ mcacomponent_LTLIBRARIES = $(component_install) mca_odls_alps_la_SOURCES = $(sources) mca_odls_alps_la_CPPFLAGS = $(odls_alps_CPPFLAGS) mca_odls_alps_la_LDFLAGS = -module -avoid-version $(odls_alps_LDFLAGS) -mca_odls_alps_la_LIBADD = $(odls_alps_LIBS) +mca_odls_alps_la_LIBADD = $(odls_alps_LIBS) \ + $(ORTE_TOP_BUILDDIR)/orte/mca/common/alps/lib@ORTE_LIB_PREFIX@mca_common_alps.la noinst_LTLIBRARIES = $(component_noinst) libmca_odls_alps_la_SOURCES =$(sources) diff --git a/orte/mca/odls/alps/odls_alps_component.c 
b/orte/mca/odls/alps/odls_alps_component.c index a19f276e7e..b5dc275879 100644 --- a/orte/mca/odls/alps/odls_alps_component.c +++ b/orte/mca/odls/alps/odls_alps_component.c @@ -37,6 +37,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" +#include "orte/mca/common/alps/common_alps.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/odls_private.h" #include "orte/mca/odls/alps/odls_alps.h" @@ -77,9 +78,7 @@ int orte_odls_alps_component_open(void) int orte_odls_alps_component_query(mca_base_module_t **module, int *priority) { int rc = ORTE_SUCCESS; - const char proc_job_file[]="/proc/job"; - FILE *fd = NULL, *fd_task_is_app = NULL; - char task_is_app_fname[PATH_MAX]; + bool flag; /* * make sure we're in a daemon process @@ -97,26 +96,10 @@ int orte_odls_alps_component_query(mca_base_module_t **module, int *priority) * the cray job kernel module - the thing that creates the PAGG */ - /* disqualify ourselves if not running in a Cray PAGG container */ - fd = fopen(proc_job_file, "r"); - if (fd == NULL) { - *priority = 0; - *module = NULL; - rc = ORTE_ERROR; - } else { - snprintf(task_is_app_fname,sizeof(task_is_app_fname), - "/proc/self/task/%ld/task_is_app",syscall(SYS_gettid)); - fd_task_is_app = fopen(task_is_app_fname, "r"); - if (fd_task_is_app != NULL) { /* okay we're in a PAGG container, - and we are an app task (not just a process - running on a mom node, for example), - so we should give cray pmi a shot. 
*/ - *priority = 10; /* take precendence over base */ - *module = (mca_base_module_t *) &orte_odls_alps_module; - fclose(fd_task_is_app); - rc = orte_odls_alps_get_rdma_creds(); - } - fclose(fd); + rc = orte_common_alps_proc_in_pagg(&flag); + if ((ORTE_SUCCESS == rc) && flag) { + *priority = 10; /* take precedence over base */ + *module = (mca_base_module_t *) &orte_odls_alps_module; } return rc; From d749077e1e197d68b6f787e4e69e11406ca29868 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 24 Nov 2014 09:01:32 -0800 Subject: [PATCH 3/7] odls/alps: make sure PMI env. variables set up Add call to orte_odls_alps_get_rdma_creds in the local proc launch step to obtain the Cray Rdma credentials from the apshepherd, and to set the PMI env. variables expected by uGNI BTL, etc. --- orte/mca/odls/alps/odls_alps_module.c | 9 +++++++++ orte/mca/odls/alps/odls_alps_utils.c | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/orte/mca/odls/alps/odls_alps_module.c b/orte/mca/odls/alps/odls_alps_module.c index c4007d2cf3..50dbdf72a8 100644 --- a/orte/mca/odls/alps/odls_alps_module.c +++ b/orte/mca/odls/alps/odls_alps_module.c @@ -740,6 +740,15 @@ int orte_odls_alps_launch_local_procs(opal_buffer_t *data) return rc; } + /* get the RDMA credentials and push them into the launch environment */ + + if (ORTE_SUCCESS != (rc = orte_odls_alps_get_rdma_creds())) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output, + "%s odls:alps:launch:failed to get GNI rdma credentials %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); + return rc; + } + /* launch the local procs */ ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_alps_fork_local_proc); diff --git a/orte/mca/odls/alps/odls_alps_utils.c b/orte/mca/odls/alps/odls_alps_utils.c index cc02225116..8236038d14 100644 --- a/orte/mca/odls/alps/odls_alps_utils.c +++ b/orte/mca/odls/alps/odls_alps_utils.c @@ -231,12 +231,6 @@ int orte_odls_alps_get_rdma_creds(void) goto fn_exit; } -#if 0 - 
fprintf(stderr,"apid = 0x%lx ptag0 %d cookie0 0x%x(%d) ptag1 %d cookie1 0x%x(%d)\n",apid, - rdmacred_buf[0].ptag,rdmacred_buf[0].cookie,rdmacred_buf[0].cookie, - rdmacred_buf[1].ptag,rdmacred_buf[1].cookie,rdmacred_buf[1].cookie); -#endif - } fn_exit: From 191fe0f9493728f8d6e8443b86b873fc8f352f57 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 25 Nov 2014 14:35:04 -0700 Subject: [PATCH 4/7] alps configury changes Clean up the orte_check_alps.m4. There was some unnecessary stuff for handling cle 5, since it wasn't actually doing the right thing, which would be to use pkg-config to find dependencies both for dynamic and static linking. Decouple the searching for alps libs, etc. from cray pmi. Switch the alps ess and alps odls components' config files to use the ALPS m4 macro. alps configury fixes Improve a check for detecting CLE release. Improve an error message. --- config/orte_check_alps.m4 | 216 +++++++++++++++++++------------- orte/mca/ess/alps/configure.m4 | 19 +-- orte/mca/odls/alps/configure.m4 | 20 +-- 3 files changed, 130 insertions(+), 125 deletions(-) diff --git a/config/orte_check_alps.m4 b/config/orte_check_alps.m4 index 5aed79385b..8160c95c26 100644 --- a/config/orte_check_alps.m4 +++ b/config/orte_check_alps.m4 @@ -17,112 +17,148 @@ # # $HEADER$ # +# ORTE_CHECK_ALPS_CLE4([action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +AC_DEFUN([ORTE_CHECK_ALPS_CLE4],[ + +# +# if we've gotten here, it's because we are building on a CLE 4 system +# + orte_check_alps_libdir_cle4_happy="no" + orte_check_alps_dir_cle4_happy="no" + + AC_MSG_CHECKING([for ALPS components on a CLE 4 system with alps $with_alps]) + + AC_ARG_WITH([alps-libdir], + [AC_HELP_STRING([--with-alps-libdir=DIR], + [Location of alps libraries (alpslli, alpsutil) (default: /usr/lib/alps (/opt/cray/xe-sysroot/default/usr/lib/alps on eslogin nodes))])]) + +# +# check to see if Open MPI is being built on a CLE 4 eslogin node +# + 
AS_IF([test -f /etc/opt/cray/release/ESLrelease], + [default_alps_dir="/opt/cray/xe-sysroot/default/usr"], + [default_alps_dir="/usr"]) + + AS_IF([test -z "$with_alps_libdir"], + [AS_IF([test "$with_alps" != "yes" -a "$with_alps" != "auto"], + [AS_IF([test -d "$with_alps/lib64"], + [orte_check_alps_libdir="$with_alps/lib64"], + [orte_check_alps_libdir="$with_alps/lib"])], + [ orte_check_alps_libdir="$default_alps_dir/lib/alps"]) + ],[orte_check_alps_libdir="$with_alps_libdir"]) + + AS_IF([test "$with_alps" = "yes" -o "$with_alps" = "auto"], + [orte_check_alps_dir=$default_alps_dir], + [orte_check_alps_dir=$with_alps]) + + AC_MSG_CHECKING([if $orte_check_alps_libdir/libalps.a is present]) + AS_IF([test -f "$orte_check_alps_libdir/libalps.a"], + [orte_check_alps_libdir_cle4_happy="yes"], + [orte_check_alps_libdir_cle4_happy="no" + AC_MSG_RESULT([no])]) + + AC_MSG_CHECKING([if $orte_check_alps_dir/include/alps/apInfo.h is present]) + AS_IF([test -f "$orte_check_alps_dir/include/alps/apInfo.h"], + [orte_check_alps_dir_cle4_happy="yes"], + [orte_check_alps_dir_cle4_happy="no" + AC_MSG_RESULT([no])]) + + AS_IF([test "$orte_check_alps_libdir_cle4_happy" = "yes" -a "$orte_check_alps_dir_cle4_happy" = "yes"], + [CRAY_ALPSLLI_CFLAGS="-I$orte_check_alps_dir/include" + CRAY_ALPSLLI_LIBS="-L$orte_check_alps_libdir -lalpslli -lalpsutil" + CRAY_ALPSLLI_STATIC_LIBS="-L$orte_check_alps_libdir -lalpslli -lalpsutil" + $1], + [$2]) +]) + + + # ORTE_CHECK_ALPS(prefix, [action-if-found], [action-if-not-found]) # -------------------------------------------------------- AC_DEFUN([ORTE_CHECK_ALPS],[ if test -z "$orte_check_alps_happy"; then - # require that we check for pmi support request first so - # we can get the static library ordering correct - AC_REQUIRE([OPAL_CHECK_CRAY_PMI]) AC_ARG_WITH([alps], [AC_HELP_STRING([--with-alps(=DIR|yes|no)], - [Build with ALPS scheduler component, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries 
(default: no)])]) - 
OPAL_CHECK_WITHDIR([alps], [$with_alps], [.]) + [Build with ALPS scheduler component, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries (default: auto)])],[],with_alps=auto) - AC_ARG_WITH([alps-libdir], - [AC_HELP_STRING([--with-alps-libdir=DIR], - [Location of alps libraries (alpslli, alpsutil) (default: /usr/lib/alps)])]) - - AC_ARG_WITH([wlm_detect], - [AC_HELP_STRING([--with-wlm_detect(=DIR)], - [Location of wlm_detect library needed by PMI on CLE 5 systems (default: /opt/cray/wlm_detect/default)])]) - - # save the CPPFLAGS so we can check for alps/apInfo.h without adding $with_alps/include to the global path - orte_check_alps_$1_save_CPPFLAGS="$CPPFLAGS" - - # - # check to see where alps is installed, it wandered to a new location starting with CLE 5.0 - # - - if test -f "/usr/lib/alps/libalps.a" ; then - using_cle5_install="no" + if test -f /etc/opt/cray/release/clerelease; then + cle_level=`awk -F. '{print [$]1}' /etc/opt/cray/release/clerelease` else - using_cle5_install="yes" - if test -z "$with_wlm_detect" ; then - with_wlm_detect="/opt/cray/wlm_detect/default" - fi - - # libpmi requires libugni for static linking on CLE 5. WTH! 
- OPAL_CHECK_UGNI($1,[orte_check_alps_happy=yes],[orte_check_alps_happy=no]) + cle_level="unknown" fi - if test "$with_alps" = "no" -o -z "$with_alps" ; then - orte_check_alps_happy="no" - else - # Only need to do these tests once - this macro is invoked - # from multiple different components' configure.m4 scripts + AC_MSG_CHECKING([for ALPS support cle level $cle_level]) + AS_IF([test "$cle_level" = "4" -a "$with_alps" != "no"], + [ORTE_CHECK_ALPS_CLE4([orte_check_cray_alps_happy="yes"], + [orte_check_cray_alps_happy="no"])], + [AS_IF([test "$with_alps" = "no"], + [AC_MSG_RESULT([no]) + $3], + [AS_IF([test "$with_alps" = "auto" -o "$with_alps" = "yes"], + [PKG_CHECK_MODULES_STATIC([CRAY_ALPSLLI], [cray-alpslli], + [orte_check_cray_alps_happy="yes"], + [orte_check_cray_alps_happy="no"] + [AS_IF([test "$with_alps" = "yes"], + [AC_MSG_WARN([ALPS support requested but pkg-config failed.]) + AC_MSG_WARN([Need to explicitly indicate ALPS directory]) + AC_MSG_WARN([on the configure line using --with-alps option.]) + AC_MSG_ERROR([Aborting])],[])] + ) + PKG_CHECK_MODULES_STATIC([CRAY_ALPSUTIL], [cray-alpsutil], + [orte_check_cray_alps_happy="yes"], + [orte_check_cray_alps_happy="no"] + [AS_IF([test "$with_alps" = "yes"], + [AC_MSG_WARN([ALPS support requested but pkg-config failed.]) + AC_MSG_WARN([Need to explicitly indicate ALPS directory]) + AC_MSG_WARN([on the configure line using --with-alps option.]) + AC_MSG_ERROR([Aborting])],[])] + ) - orte_check_alps_happy="yes" - orte_check_alps_libdir="$with_alps_libdir" - - if test -z "$orte_check_alps_libdir" ; then - if test "$with_alps" != "yes" ; then - AS_IF([test -d "$with_alps/lib64"], - [orte_check_alps_libdir="$with_alps/lib64"], - [orte_check_alps_libdir="$with_alps/lib"]) - else - if test "$using_cle5_install" = "yes"; then - orte_check_alps_libdir="/opt/cray/alps/default/lib64" - else - orte_check_alps_libdir="/usr/lib/alps" - fi - fi - fi + PKG_CHECK_MODULES_STATIC([CRAY_ALPS], [cray-alps], + 
[orte_check_cray_alps_happy="yes"], + [orte_check_cray_alps_happy="no"] + [AS_IF([test "$with_alps" = "yes"], + [AC_MSG_WARN([ALPS support requested but pkg-config failed.]) + AC_MSG_WARN([Need to explicitly indicate ALPS directory]) + AC_MSG_WARN([on the configure line using --with-alps option.]) + AC_MSG_ERROR([Aborting])],[])] + ) - if test "$with_alps" = "yes" ; then - AS_IF([test "$using_cle5_install" = "yes"], - [orte_check_alps_dir="/opt/cray/alps/default"], - [orte_check_alps_dir="/usr"]) - else - orte_check_alps_dir="$with_alps" - fi + ], + [AC_MSG_WARN([See ./configure --help for how to control Open MPI]) + AC_MSG_WARN([configuration for ALPS on CLE 5 and higher systems]) + AC_MSG_ERROR([Aborting])]) + ]) + ]) - if test -z "$orte_check_alps_pmi_happy"; then - # if pmi support is requested, then OPAL_CHECK_PMI - # will have added the -lpmi flag to LIBS. We then need - # to add a couple of alps libs to support static - # builds - if test "$opal_enable_pmi" = 1 ; then - AC_MSG_CHECKING([for alps libraries in "$orte_check_alps_libdir"]) + AC_MSG_RESULT([orte_check_cray_alps_happy = $orte_check_cray_alps_happy]) + + AS_IF([test "$orte_check_cray_alps_happy" = "yes" -a "$enable_static" = "yes"], + [CRAY_ALPSLLI_LIBS="$CRAY_ALPSLLI_STATIC_LIBS" + CRAY_ALPSUTIL_LIBS="$CRAY_ALPSUTIL_STATIC_LIBS"], + []) + + AC_MSG_RESULT([CRAY_ALPSLLI_STATIC_LIBS - $CRAY_ALPSLLI_STATIC_LIBS]) + AC_MSG_RESULT([CRAY_ALPSLLI_LIBS - $CRAY_ALPSLLI_LIBS]) + AC_MSG_RESULT([CRAY_ALPSLLI_CFLAGS - $CRAY_ALPSLLI_CFLAGS]) + + AC_MSG_RESULT([CRAY_ALPSUTIL_STATIC_LIBS - $CRAY_ALPSUTIL_STATIC_LIBS]) + AC_MSG_RESULT([CRAY_ALPSUTIL_LIBS - $CRAY_ALPSUTIL_LIBS]) + AC_MSG_RESULT([CRAY_ALPSUTIL_CFLAGS - $CRAY_ALPSUTIL_CFLAGS]) + + AC_MSG_RESULT([CRAY_ALPS_CFLAGS - $CRAY_ALPS_CFLAGS]) + + AS_IF([test "$orte_check_cray_alps_happy" = "yes"], + [$1_LDFLAGS="$CRAY_ALPSLLI_LIBS $CRAY_ALPSUTIL_LIBS" + $1_CPPFLAGS="$CRAY_ALPSLLI_CFLAGS $CRAY_ALPSUTIL_CFLAGS $CRAY_ALPS_CFLAGS" + $1_LIBS="$CRAY_ALPSLLI_LIBS 
$CRAY_ALPSUTIL_LIBS"], + []) - # libalpslli and libalpsutil are needed by libpmi to compile statically - AS_IF([test -f "$orte_check_alps_libdir/libalpslli.a" -a -f "$orte_check_alps_libdir/libalpsutil.a"], - [AC_MSG_RESULT([found]) - orte_check_alps_pmi_happy=yes], - [AC_MSG_WARN([PMI support for Alps requested but not found]) - AC_MSG_ERROR([Cannot continue])]) - fi - fi - fi fi - # Set LIBS, CPPFLAGS, and LDFLAGS here so they always get set - if test "$orte_check_alps_happy" = "yes" -a "$opal_enable_pmi" = 1 ; then - $1_LIBS="-lalpslli -lalpsutil" - fi - - $1_CPPFLAGS="-I$orte_check_alps_dir/include" - $1_LDFLAGS="-L$orte_check_alps_libdir" - - # Add CLE 5 library dependencies - if test "using_cle5_install" = "yes" ; then - $1_LIBS="$$1_LIBS -lwlm_detect" - $1_LDFLAGS="$$1_LDFLAGS -L$with_wlm_detect" - fi - - AS_IF([test "$orte_check_alps_happy" = "yes"], - [$2], - [$3]) + AS_IF([test "$orte_check_cray_alps_happy" = "yes"], + [$2], [$3]) ]) diff --git a/orte/mca/ess/alps/configure.m4 b/orte/mca/ess/alps/configure.m4 index ee6ecb35d8..6ad187ae3b 100644 --- a/orte/mca/ess/alps/configure.m4 +++ b/orte/mca/ess/alps/configure.m4 @@ -25,24 +25,9 @@ AC_DEFUN([MCA_orte_ess_alps_CONFIG],[ AC_CONFIG_FILES([orte/mca/ess/alps/Makefile]) - ess_alps_lli_happy="no" - ess_alps_util_happy="no" + ORTE_CHECK_ALPS([ess_alps], [ess_alps_happy="yes"], [ess_alps_happy="no"]) - PKG_CHECK_MODULES([CRAY_ALPS_LLI], [cray-alpslli], - [ess_alps_CPPFLAGS=$CRAY_ALPS_LLI_CFLAGS - ess_alps_LDFLAGS=$CRAY_ALPS_LLI_LIBS - ess_alps_LIBS=$CRAY_ALPS_LLI_LIBS - ess_alps_lli_happy="yes"], - [AC_MSG_RESULT([no])]) - - PKG_CHECK_MODULES([CRAY_ALPS_UTIL], [cray-alpsutil], - [ess_alps_CPPFLAGS="$ess_alps_CPPFLAGS $CRAY_ALPS_UTIL_CFLAGS" - ess_alps_LDFLAGS="$ess_alps_LDFLAGS $CRAY_ALPS_UTIL_LIBS" - ess_alps_LIBS="$ess_alps_LIBS $CRAY_ALPS_LLI_LIBS" - ess_alps_util_happy="yes"], - [AC_MSG_RESULT([no])]) - - AS_IF([test "$ess_alps_lli_happy" = "yes" -a "$ess_alps_util_happy" = "yes"], + AS_IF([test 
"$ess_alps_happy" = "yes"], [$1 AC_SUBST([ess_alps_CPPFLAGS]) AC_SUBST([ess_alps_LDFLAGS]) diff --git a/orte/mca/odls/alps/configure.m4 b/orte/mca/odls/alps/configure.m4 index a5d710c1d8..b8b7bad20f 100644 --- a/orte/mca/odls/alps/configure.m4 +++ b/orte/mca/odls/alps/configure.m4 @@ -25,29 +25,13 @@ AC_DEFUN([MCA_orte_odls_alps_CONFIG],[ AC_CONFIG_FILES([orte/mca/odls/alps/Makefile]) - odls_alps_lli_happy="no" - odls_alps_util_happy="no" + ORTE_CHECK_ALPS([odls_alps], [odls_alps_happy="yes"], [odls_alps_happy="no"]) - PKG_CHECK_MODULES([CRAY_ALPS_LLI], [cray-alpslli], - [odls_alps_CPPFLAGS=$CRAY_ALPS_LLI_CFLAGS - odls_alps_LDFLAGS=$CRAY_ALPS_LLI_LIBS - odls_alps_LIBS=$CRAY_ALPS_LLI_LIBS - odls_alps_lli_happy="yes"], - [AC_MSG_RESULT([no])]) - - PKG_CHECK_MODULES([CRAY_ALPS_UTIL], [cray-alpsutil], - [odls_alps_CPPFLAGS="$odls_alps_CPPFLAGS $CRAY_ALPS_UTIL_CFLAGS" - odls_alps_LDFLAGS="$odls_alps_LDFLAGS $CRAY_ALPS_UTIL_LIBS" - odls_alps_LIBS="$odls_alps_LIBS $CRAY_ALPS_LLI_LIBS" - odls_alps_util_happy="yes"], - [AC_MSG_RESULT([no])]) - - AS_IF([test "$odls_alps_lli_happy" = "yes" -a "$odls_alps_util_happy" = "yes"], + AS_IF([test "$odls_alps_happy" = "yes"], [$1 AC_SUBST([odls_alps_CPPFLAGS]) AC_SUBST([odls_alps_LDFLAGS]) AC_SUBST([odls_alps_LIBS])], [$2]) - ])dnl From ec38aa37323f22193318d81a845fe971b6ec9a99 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Wed, 26 Nov 2014 09:42:31 -0700 Subject: [PATCH 5/7] orte/mca/common: add missing Makefile.am --- orte/mca/common/Makefile.am | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 orte/mca/common/Makefile.am diff --git a/orte/mca/common/Makefile.am b/orte/mca/common/Makefile.am new file mode 100644 index 0000000000..33bbb5f2a3 --- /dev/null +++ b/orte/mca/common/Makefile.am @@ -0,0 +1,25 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. 
+# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Note that this file must exist, even though it is empty (there is no +# "base" directory for the common framework). autogen.pl and +# opal_mca.m4 assume that every framework has a top-level Makefile.am. +# We *could* adjust the framework glue code to exclude "common" from +# this requirement, but it's just a lot easier to have an empty +# Makefile.am here. From 666344a081527e4e19323cb9894cf0ca66b97ff9 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Wed, 26 Nov 2014 12:21:28 -0700 Subject: [PATCH 6/7] orte/mca/common/alps: fix configure file Fix configure file for alps to actually check for alps being available. Also include stdio.h explicitly in common_alps.c --- orte/mca/common/alps/common_alps.c | 1 + orte/mca/common/alps/configure.m4 | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/orte/mca/common/alps/common_alps.c b/orte/mca/common/alps/common_alps.c index 3daad6f42a..6fd77ad054 100644 --- a/orte/mca/common/alps/common_alps.c +++ b/orte/mca/common/alps/common_alps.c @@ -30,6 +30,7 @@ #include "orte/constants.h" #include "orte/mca/common/alps/common_alps.h" +#include <stdio.h> #include #include diff --git a/orte/mca/common/alps/configure.m4 b/orte/mca/common/alps/configure.m4 index e0ac065544..519dd23b0d 100644 --- a/orte/mca/common/alps/configure.m4 +++ b/orte/mca/common/alps/configure.m4 @@ -13,6 +13,8 @@ # Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. +# Copyright (c) 2014 Los Alamos National Security, LLC. +# All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -25,17 +27,14 @@ # ------------------------------------------------ AC_DEFUN([MCA_orte_common_alps_CONFIG],[ AC_CONFIG_FILES([orte/mca/common/alps/Makefile]) - common_verbs_happy="no" - OPAL_CHECK_OPENFABRICS([common_verbs], - [common_verbs_happy="yes"]) - AS_IF([test "$common_verbs_happy" = "yes"], - [$1], + ORTE_CHECK_ALPS([common_alps], [common_alps_happy="yes"], [common_alps_happy="no"]) + + AS_IF([test "$common_alps_happy" = "yes"], + [$1 + AC_SUBST([common_alps_CPPFLAGS]) + AC_SUBST([common_alps_LDFLAGS]) + AC_SUBST([common_alps_LIBS])], [$2]) - - # substitute in the things needed to build openib - AC_SUBST([common_alps_CFLAGS]) - AC_SUBST([common_alps_CPPFLAGS]) - AC_SUBST([common_alps_LDFLAGS]) - AC_SUBST([common_alps_LIBS]) + # ])dnl From c75dccede192e8e2126eced4475c7f4f2fe1d320 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 2 Dec 2014 14:46:36 -0700 Subject: [PATCH 7/7] pmix/cray: remove finalize call from comp close The finalize call in component close method is no longer being matched by an equivalent init call, so remove this call in the close method. --- opal/mca/pmix/cray/pmix_cray_component.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/opal/mca/pmix/cray/pmix_cray_component.c b/opal/mca/pmix/cray/pmix_cray_component.c index 8027d54f96..21a9fd658a 100644 --- a/opal/mca/pmix/cray/pmix_cray_component.c +++ b/opal/mca/pmix/cray/pmix_cray_component.c @@ -105,10 +105,6 @@ static int pmix_cray_component_query(mca_base_module_t **module, int *priority) static int pmix_cray_component_close(void) { - int ret = OPAL_SUCCESS; - - ret = opal_pmix_cray_module.finalize(); - - return ret; + return OPAL_SUCCESS; }