From 3c914a7a979273cf5b2bc5bac4e4608880d8af87 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 15 Sep 2017 06:50:58 -0700 Subject: [PATCH] Complete the fix of the ORTE DVM. We will now use "prun" instead of "orterun -hnp foo" to execute jobs. This provides the feature of automatic discovery of the orte-dvm so you don't need to manually enter URIs or contact file locations. All IO is forwarded to prun. Still in the "needs to be done" category: * mapping/ranking/binding options aren't correctly supported * if the DVM encounters some errors (e.g., not enough resources for the job), the resulting error is globally set and impacts any subsequent job submission Signed-off-by: Ralph Castain --- ompi/mca/rte/orte/Makefile.am | 8 +- opal/dss/dss_pack.c | 5 +- opal/dss/dss_unpack.c | 12 +- opal/mca/pmix/base/pmix_base_frame.c | 2 + opal/mca/pmix/ext2x/configure.m4 | 11 +- opal/mca/pmix/pmix.h | 18 ++ opal/mca/pmix/pmix2x/configure.m4 | 11 +- opal/mca/pmix/pmix2x/pmix/VERSION | 4 +- opal/mca/pmix/pmix2x/pmix/contrib/pmix.spec | 112 ++++----- .../pmix2x/pmix/src/class/pmix_value_array.c | 2 +- .../base/pmix_mca_base_component_compare.c | 2 +- .../pmix/src/mca/bfrops/base/bfrop_base_fns.c | 8 +- .../src/mca/pinstalldirs/config/configure.m4 | 2 +- .../src/mca/pinstalldirs/env/configure.m4 | 2 +- .../pmix/pmix2x/pmix/src/util/pmix_environ.c | 2 +- opal/mca/pmix/pmix2x/pmix/src/util/strnlen.h | 2 +- opal/mca/pmix/pmix2x/pmix2x.c | 21 +- opal/mca/pmix/pmix2x/pmix2x_client.c | 24 +- opal/runtime/opal_init.c | 2 +- orte/mca/ess/alps/ess_alps_module.c | 2 +- orte/mca/ess/base/base.h | 2 +- orte/mca/ess/base/ess_base_std_tool.c | 230 +++++++++++------- orte/mca/ess/lsf/ess_lsf_module.c | 2 +- orte/mca/ess/slurm/ess_slurm_module.c | 2 +- orte/mca/ess/tm/ess_tm_module.c | 2 +- orte/mca/ess/tool/ess_tool.h | 5 +- orte/mca/ess/tool/ess_tool_component.c | 27 +- orte/mca/ess/tool/ess_tool_module.c | 13 +- orte/mca/iof/iof.h | 46 ++-- orte/mca/plm/base/plm_base_launch_support.c | 8 +- 
orte/orted/pmix/pmix_server_dyn.c | 3 + orte/orted/pmix/pmix_server_gen.c | 7 + orte/tools/Makefile.am | 2 - orte/tools/orte-dvm/orte-dvm.c | 19 +- orte/tools/prun/Makefile.am | 8 +- orte/tools/prun/prun.c | 176 ++++++-------- 36 files changed, 436 insertions(+), 368 deletions(-) diff --git a/ompi/mca/rte/orte/Makefile.am b/ompi/mca/rte/orte/Makefile.am index 1b9ae9d6c4..80e07c30df 100644 --- a/ompi/mca/rte/orte/Makefile.am +++ b/ompi/mca/rte/orte/Makefile.am @@ -47,10 +47,8 @@ install-exec-hook: (cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT)) if OPAL_WANT_PRUN -if WANT_INSTALL_HEADERS (cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT)) endif -endif uninstall-local: rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \ @@ -60,9 +58,7 @@ uninstall-local: $(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-server$(EXEEXT) if OPAL_WANT_PRUN -if WANT_INSTALL_HEADERS - m -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) -endif + rm -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) endif endif # OPAL_INSTALL_BINARIES @@ -119,11 +115,9 @@ ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1 cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1 if OPAL_WANT_PRUN -if WANT_INSTALL_HEADERS ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1 endif -endif clean-local: rm -f $(man_pages) diff --git a/opal/dss/dss_pack.c b/opal/dss/dss_pack.c index 23c9d3b31b..87a7573a03 100644 --- a/opal/dss/dss_pack.c +++ b/opal/dss/dss_pack.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. 
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -837,7 +837,7 @@ int opal_dss_pack_value(opal_buffer_t *buffer, const void *src, } break; default: - opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE %d", (int)ptr[i]->type); + opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE %d FOR KEY %s", (int)ptr[i]->type, ptr[i]->key); return OPAL_ERROR; } } @@ -981,4 +981,3 @@ int opal_dss_pack_status(opal_buffer_t *buffer, const void *src, return ret; } - diff --git a/opal/dss/dss_unpack.c b/opal/dss/dss_unpack.c index be9993983c..212851bb94 100644 --- a/opal/dss/dss_unpack.c +++ b/opal/dss/dss_unpack.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012-2015 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -1086,13 +1086,21 @@ int opal_dss_unpack_value(opal_buffer_t *buffer, void *dest, return ret; } break; + case OPAL_PTR: + /* just ignore these values */ + break; case OPAL_NAME: if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.name, &m, OPAL_NAME))) { return ret; } break; + case OPAL_STATUS: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.status, &m, OPAL_INT))) { + return ret; + } + break; default: - opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE"); + opal_output(0, "UNPACK-OPAL-VALUE: UNSUPPORTED TYPE %d FOR KEY %s", (int)ptr[i]->type, ptr[i]->key); return OPAL_ERROR; } } diff --git a/opal/mca/pmix/base/pmix_base_frame.c b/opal/mca/pmix/base/pmix_base_frame.c index eaec152edc..e06d18c0e3 100644 --- a/opal/mca/pmix/base/pmix_base_frame.c +++ b/opal/mca/pmix/base/pmix_base_frame.c @@ -84,6 +84,8 @@ static int opal_pmix_base_frame_open(mca_base_open_flag_t flags) rc = mca_base_framework_components_open(&opal_pmix_base_framework, flags); /* ensure the function pointers are NULL */ memset(&opal_pmix, 0, sizeof(opal_pmix)); + /* default to the OPAL event base */ + opal_pmix_base.evbase = opal_sync_event_base; /* pass across the verbosity */ opal_pmix_verbose_output = opal_pmix_base_framework.framework_output; return rc; diff --git a/opal/mca/pmix/ext2x/configure.m4 b/opal/mca/pmix/ext2x/configure.m4 index 1e27bace65..171f735f3b 100644 --- a/opal/mca/pmix/ext2x/configure.m4 +++ b/opal/mca/pmix/ext2x/configure.m4 @@ -48,16 +48,7 @@ AC_DEFUN([MCA_opal_pmix_ext2x_CONFIG],[ [$1 # need to set the wrapper flags for static builds pmix_ext2x_WRAPPER_EXTRA_LDFLAGS=$opal_external_pmix_LDFLAGS - pmix_ext2x_WRAPPER_EXTRA_LIBS=$opal_external_pmix_LIBS - # and the flags for prun - OPAL_PMIX_CPPFLAGS="-I$opal_external_pmix_CPPFLAGS" - AC_SUBST(OPAL_PMIX_CPPFLAGS) - OPAL_PMIX_LDFLAGS=$opal_external_pmix_LDFLAGS - AC_SUBST(OPAL_PMIX_LDFLAGS) - OPAL_PMIX_LDADD= - AC_SUBST(OPAL_PMIX_LDADD) - OPAL_PMIX_LIBS=-lpmix - 
AC_SUBST(OPAL_PMIX_LIBS)], + pmix_ext2x_WRAPPER_EXTRA_LIBS=$opal_external_pmix_LIBS], [$2])], [$2]) diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index ecd8349fe7..53e04571ab 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -852,6 +852,21 @@ typedef void (*opal_pmix_base_module_query_fn_t)(opal_list_t *queries, typedef void (*opal_pmix_base_log_fn_t)(opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); +/* allocation */ +typedef int (*opal_pmix_base_alloc_fn_t)(opal_pmix_alloc_directive_t directive, + opal_list_t *info, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* job control */ +typedef int (*opal_pmix_base_job_control_fn_t)(opal_list_t *targets, + opal_list_t *directives, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* monitoring */ +typedef int (*opal_pmix_base_process_monitor_fn_t)(opal_list_t *monitor, + opal_list_t *directives, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); + /* * the standard public API data structure */ @@ -883,6 +898,9 @@ typedef struct { opal_pmix_base_module_resolve_nodes_fn_t resolve_nodes; opal_pmix_base_module_query_fn_t query; opal_pmix_base_log_fn_t log; + opal_pmix_base_alloc_fn_t allocate; + opal_pmix_base_job_control_fn_t job_control; + opal_pmix_base_process_monitor_fn_t monitor; /* server APIs */ opal_pmix_base_module_server_init_fn_t server_init; opal_pmix_base_module_server_finalize_fn_t server_finalize; diff --git a/opal/mca/pmix/pmix2x/configure.m4 b/opal/mca/pmix/pmix2x/configure.m4 index e94fd03bd0..372a2db43f 100644 --- a/opal/mca/pmix/pmix2x/configure.m4 +++ b/opal/mca/pmix/pmix2x/configure.m4 @@ -86,16 +86,7 @@ AC_DEFUN([MCA_opal_pmix_pmix2x_CONFIG],[ opal_pmix_pmix2x_LDFLAGS= opal_pmix_pmix2x_LIBS="$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/src/libpmix.la" opal_pmix_pmix2x_CPPFLAGS="-I$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/include -I$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix -I$OPAL_TOP_SRCDIR/$opal_pmix_pmix2x_basedir/pmix/include 
-I$OPAL_TOP_SRCDIR/$opal_pmix_pmix2x_basedir/pmix" - opal_pmix_pmix2x_DEPENDENCIES="$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/src/libpmix.la" - # and the flags for prun - OPAL_PMIX_CPPFLAGS="$opal_pmix_pmix2x_CPPFLAGS" - AC_SUBST(OPAL_PMIX_CPPFLAGS) - OPAL_PMIX_LDADD=$opal_pmix_pmix2x_LIBS - AC_SUBST(OPAL_PMIX_LDADD) - OPAL_PMIX_LIBS= - AC_SUBST(OPAL_PMIX_LIBS) - OPAL_PMIX_LDFLAGS= - AC_SUBST(OPAL_PMIX_LDFLAGS)]) + opal_pmix_pmix2x_DEPENDENCIES="$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/src/libpmix.la"]) AC_SUBST([opal_pmix_pmix2x_LIBS]) AC_SUBST([opal_pmix_pmix2x_CPPFLAGS]) diff --git a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index 987eeb5e9c..408462946c 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -30,7 +30,7 @@ greek= # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=gitdcf4faf +repo_rev=git2389189 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Sep 13, 2017" +date="Sep 14, 2017" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix2x/pmix/contrib/pmix.spec b/opal/mca/pmix/pmix2x/pmix/contrib/pmix.spec index 236986114b..51378c2de6 100644 --- a/opal/mca/pmix/pmix2x/pmix/contrib/pmix.spec +++ b/opal/mca/pmix/pmix2x/pmix/contrib/pmix.spec @@ -9,10 +9,12 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2013 Mellanox Technologies, Inc. # All rights reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved. 
+# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -61,7 +63,7 @@ # type: string (root path to install shell scripts) %{!?shell_scripts_path: %define shell_scripts_path %{_bindir}} # type: string (base name of the shell scripts) -%{!?shell_scripts_basename: %define shell_scripts_basename mpivars} +%{!?shell_scripts_basename: %define shell_scripts_basename pmixvars} # Define this to 1 if you want this RPM to install a modulefile. # type: bool (0/1) @@ -78,17 +80,6 @@ # type: string (name of modules RPM) %{!?modules_rpm_name: %define modules_rpm_name environment-modules} -# Should we use the mpi-selector functionality? -# type: bool (0/1) -%{!?use_mpi_selector: %define use_mpi_selector 0} -# The name of the mpi-selector RPM. Can vary from system to system. -# type: string (name of mpi-selector RPM) -%{!?mpi_selector_rpm_name: %define mpi_selector_rpm_name mpi-selector} -# The location of the mpi-selector executable (can be a relative path -# name if "mpi-selector" can be found in the path) -# type: string (path to mpi-selector exectuable) -%{!?mpi_selector: %define mpi_selector mpi-selector} - # Should we build a debuginfo RPM or not? # type: bool (0/1) %{!?build_debuginfo_rpm: %define build_debuginfo_rpm 0} @@ -100,7 +91,7 @@ # Should we use the default "check_files" RPM step (i.e., check for # unpackaged files)? It is discouraged to disable this, but some # installers need it (e.g., older versions of OFED, because they -# installed lots of other stuff in the BUILD_ROOT before PMIx/SHMEM). +# installed lots of other stuff in the BUILD_ROOT before PMIx). 
# type: bool (0/1) %{!?use_check_files: %define use_check_files 1} @@ -125,7 +116,7 @@ # type: bool (0/1) %{!?disable_auto_requires: %define disable_auto_requires 0} -# On some platforms, PMIx/SHMEM just flat-out doesn't work with +# On some platforms, PMIx just flat-out doesn't work with # -D_FORTIFY_SOURCE (e.g., some users have reported that there are # problems on ioa64 platforms). In this case, just turn it off # (meaning: this specfile will strip out that flag from the @@ -152,7 +143,7 @@ %define _includedir /opt/%{name}/%{version}/include %define _mandir /opt/%{name}/%{version}/man # Note that the name "pmix" is hard-coded in -# opal/mca/installdirs/config for pkgdatadir; there is currently no +# src/mca/installdirs/config for pkgdatadir; there is currently no # easy way to have PMIx change this directory name internally. So we # just hard-code that name here as well (regardless of the value of # %{name} or %{_name}). @@ -162,6 +153,8 @@ # bets are off. So feel free to install it anywhere in your tree. He # suggests $prefix/doc. %define _defaultdocdir /opt/%{name}/%{version}/doc +# Also put the modulefile in /opt. 
+%define modulefile_path /opt/%{name}/%{version}/share/pmixmodulefiles %endif %if !%{build_debuginfo_rpm} @@ -191,10 +184,6 @@ %define optflags "" %endif -%if %{use_mpi_selector} -%define install_shell_scripts 1 -%endif - ############################################################################# # # Preamble Section @@ -212,7 +201,7 @@ Packager: %{?_packager:%{_packager}}%{!?_packager:%{_vendor}} Vendor: %{?_vendorinfo:%{_vendorinfo}}%{!?_vendorinfo:%{_vendor}} Distribution: %{?_distribution:%{_distribution}}%{!?_distribution:%{_vendor}} Prefix: %{_prefix} -Provides: mpi +Provides: pmix Provides: pmix = %{version} BuildRoot: /var/tmp/%{name}-%{version}-%{release}-root %if %{disable_auto_requires} @@ -221,9 +210,6 @@ AutoReq: no %if %{install_modulefile} Requires: %{modules_rpm_name} %endif -%if %{use_mpi_selector} -Requires: %{mpi_selector_rpm_name} -%endif %description The Process Management Interface (PMI) has been used for quite some time as a @@ -340,9 +326,8 @@ fi CFLAGS="%{?cflags:%{cflags}}%{!?cflags:$RPM_OPT_FLAGS}" CXXFLAGS="%{?cxxflags:%{cxxflags}}%{!?cxxflags:$RPM_OPT_FLAGS}" -FFLAGS="%{?f77flags:%{f77flags}}%{!?f7flags:$RPM_OPT_FLAGS}" FCFLAGS="%{?fcflags:%{fcflags}}%{!?fcflags:$RPM_OPT_FLAGS}" -export CFLAGS CXXFLAGS F77FLAGS FCFLAGS +export CFLAGS CXXFLAGS FCFLAGS %configure %{configure_options} %{__make} %{?mflags} @@ -369,14 +354,14 @@ cat <$RPM_BUILD_ROOT/%{modulefile_path}/%{modulefile_subdir}/%{modulefile_ #%Module # NOTE: This is an automatically-generated file! (generated by the -# PMIx/SHMEM RPM). Any changes made here will be lost a) if the RPM is +# PMIx RPM). Any changes made here will be lost a) if the RPM is # uninstalled, or b) if the RPM is upgraded or uninstalled. 
proc ModulesHelp { } { - puts stderr "This module adds PMIx/SHMEM v%{version} to various paths" + puts stderr "This module adds PMIx v%{version} to various paths" } -module-whatis "Sets up PMIx/SHMEM v%{version} in your enviornment" +module-whatis "Sets up PMIx v%{version} in your enviornment" prepend-path PATH "%{_prefix}/bin/" prepend-path LD_LIBRARY_PATH %{_libdir} @@ -391,7 +376,7 @@ EOF %{__mkdir_p} $RPM_BUILD_ROOT/%{shell_scripts_path} cat < $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.sh # NOTE: This is an automatically-generated file! (generated by the -# PMIx/SHMEM RPM). Any changes made here will be lost if the RPM is +# PMIx RPM). Any changes made here will be lost if the RPM is # uninstalled or upgraded. # PATH @@ -412,13 +397,10 @@ if test -z "\`echo \$MANPATH | grep %{_mandir}\`"; then export MANPATH fi -# MPI_ROOT -MPI_ROOT=%{_prefix} -export MPI_ROOT EOF cat < $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.csh # NOTE: This is an automatically-generated file! (generated by the -# PMIx/SHMEM RPM). Any changes made here will be lost if the RPM is +# PMIx RPM). Any changes made here will be lost if the RPM is # uninstalled or upgraded. 
# path @@ -444,8 +426,6 @@ else setenv MANPATH %{_mandir}: endif -# MPI_ROOT -setenv MPI_ROOT %{_prefix} EOF %endif # End of shell_scripts if @@ -465,30 +445,6 @@ rm -rf $RPM_BUILD_DIR/%{name}-%{version} test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT -############################################################################# -# -# Post Install Section -# -############################################################################# -%if %{use_mpi_selector} -%post -%{mpi_selector} \ - --register %{name}-%{version} \ - --source-dir %{shell_scripts_path} \ - --yes -%endif - -############################################################################# -# -# Pre Uninstall Section -# -############################################################################# -%if %{use_mpi_selector} -%preun -%{mpi_selector} --unregister %{name}-%{version} --yes || \ - /bin/true > /dev/null 2> /dev/null -%endif - ############################################################################# # # Files Section @@ -504,13 +460,20 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT %files %defattr(-, root, root, -) +%if %(test "%{_prefix}" = "/usr" && echo 1 || echo 0) +#%{_bindir}/* +%{_includedir}/* +%{_libdir}/* +%{_datadir} +%else %{_prefix} -# If the sysconfdir is not under the prefix, then list it explicitly. -%if !%{sysconfdir_in_prefix} -%{_sysconfdir} %endif -# If %{install_in_opt}, then we're installing PMIx to -# /opt/pmix/. But be sure to also explicitly mention +# If the sysconfdir is not under the prefix, then list it explicitly. +#%if !%{sysconfdir_in_prefix} +#%{_sysconfdir}/* +#%endif +# If %{install_in_opt}, then we're instaling PMIx to +# /opt/pmix. But be sure to also explicitly mention # /opt/pmix so that it can be removed by RPM when everything under # there is also removed. 
%if %{install_in_opt} @@ -527,14 +490,22 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT %endif %doc README INSTALL LICENSE + ############################################################################# # # Changelog # ############################################################################# %changelog -* Fri Jun 19 2015 Ralph H. Castain -- Port to PMIx +* Tue Sep 12 2017 Ralph Castain +- Port to pmix + +* Tue Mar 28 2017 Jeff Squyres +- Reverting a decision from a prior changelog entry: if + install_in_opt==1, then even put the modulefile under /opt. + +* Thu Nov 12 2015 Gilles Gouaillardet +- Revamp packaging when prefix is /usr * Tue Jan 20 2015 Bert Wesarg - Remove VampirTrace wrapper from package. @@ -545,7 +516,7 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT fields in case %{name} is overridden. * Mon Jun 24 2013 Igor Ivanov -- Add Open SHMEM parallel programming library as part of Open MPI +- Add Open parallel programming library as part of PMIx * Tue Dec 11 2012 Jeff Squyres - Re-release 1.6.0-1.6.3 SRPMs (with new SRPM Release numbers) with @@ -593,7 +564,7 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT leave_build_root kludge nastyness. W00t! * Fri Jan 18 2008 Jeff Squyres -- Remove the hard-coded "pmix" name from two Requires statements +- Remove the hard-coded "openmpi" name from two Requires statements and use %{name} instead (FWIW, %{_name} caused rpmbuild to barf). * Wed Jan 2 2008 Jeff Squyres @@ -683,4 +654,3 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT * Wed Mar 23 2005 Mezzanine - Specfile auto-generated by Mezzanine - diff --git a/opal/mca/pmix/pmix2x/pmix/src/class/pmix_value_array.c b/opal/mca/pmix/pmix2x/pmix/src/class/pmix_value_array.c index 166759beee..31c19c15c4 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/class/pmix_value_array.c +++ b/opal/mca/pmix/pmix2x/pmix/src/class/pmix_value_array.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. 
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/base/pmix_mca_base_component_compare.c b/opal/mca/pmix/pmix2x/pmix/src/mca/base/pmix_mca_base_component_compare.c index 78ae2781a9..8f1fed5e56 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/base/pmix_mca_base_component_compare.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/base/pmix_mca_base_component_compare.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/bfrops/base/bfrop_base_fns.c b/opal/mca/pmix/pmix2x/pmix/src/mca/bfrops/base/bfrop_base_fns.c index 56c6a0164c..5abf19c1e8 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/bfrops/base/bfrop_base_fns.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/bfrops/base/bfrop_base_fns.c @@ -510,10 +510,14 @@ pmix_status_t pmix_bfrops_base_value_xfer(pmix_value_t *p, memcpy(&p->data.status, &src->data.status, sizeof(pmix_status_t)); break; case PMIX_PROC: - memcpy(&p->data.proc, &src->data.proc, sizeof(pmix_proc_t)); + PMIX_PROC_CREATE(p->data.proc, 1); + if (NULL == p->data.proc) { + return PMIX_ERR_NOMEM; + } + memcpy(p->data.proc, src->data.proc, sizeof(pmix_proc_t)); break; case PMIX_PROC_RANK: - memcpy(&p->data.proc, &src->data.rank, sizeof(pmix_rank_t)); + memcpy(&p->data.rank, &src->data.rank, sizeof(pmix_rank_t)); break; case PMIX_BYTE_OBJECT: case PMIX_COMPRESSED_STRING: diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/config/configure.m4 
b/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/config/configure.m4 index c057b438d0..a73172e07a 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/config/configure.m4 +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/config/configure.m4 @@ -3,7 +3,7 @@ # Copyright (c) 2006 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2016 Intel, Inc. All rights reserved +# Copyright (c) 2016-2017 Intel, Inc. All rights reserved. # Copyright (c) 2016 Research Organization for Information Science # and Technology (RIST). All rights reserved. # $COPYRIGHT$ diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/env/configure.m4 b/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/env/configure.m4 index 47e18c3c51..90916d196e 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/env/configure.m4 +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/pinstalldirs/env/configure.m4 @@ -3,7 +3,7 @@ # Copyright (c) 2006 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2016 Intel, Inc. All rights reserved +# Copyright (c) 2016-2017 Intel, Inc. All rights reserved. # Copyright (c) 2016 Research Organization for Information Science # and Technology (RIST). All rights reserved. # $COPYRIGHT$ diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/pmix_environ.c b/opal/mca/pmix/pmix2x/pmix/src/util/pmix_environ.c index ff5ef60da8..1e1cfaaa88 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/pmix_environ.c +++ b/opal/mca/pmix/pmix2x/pmix/src/util/pmix_environ.c @@ -12,7 +12,7 @@ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/strnlen.h b/opal/mca/pmix/pmix2x/pmix/src/util/strnlen.h index ec2401e346..b467fbf339 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/strnlen.h +++ b/opal/mca/pmix/pmix2x/pmix/src/util/strnlen.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix2x/pmix2x.c b/opal/mca/pmix/pmix2x/pmix2x.c index 6edf9a320f..af2489b812 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.c +++ b/opal/mca/pmix/pmix2x/pmix2x.c @@ -99,6 +99,8 @@ const opal_pmix_base_module_t opal_pmix_pmix2x_module = { .resolve_nodes = pmix2x_resolve_nodes, .query = pmix2x_query, .log = pmix2x_log, + .allocate = pmix2x_allocate, + .job_control = pmix2x_job_control, /* server APIs */ .server_init = pmix2x_server_init, .server_finalize = pmix2x_server_finalize, @@ -265,9 +267,7 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id, } else { if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&cd->pname.jobid, source->nspace))) { OPAL_ERROR_LOG(rc); - OBJ_RELEASE(cd); - OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); - return; + cd->pname.jobid = OPAL_NAME_INVALID->jobid; } cd->pname.vpid = pmix2x_convert_rank(source->rank); } @@ -750,7 +750,7 @@ void pmix2x_value_load(pmix_value_t *v, break; case OPAL_STATUS: v->type = PMIX_STATUS; - memcpy(&(v->data.status), &kv->data.status, sizeof(pmix_status_t)); + v->data.status = pmix2x_convert_opalrc(kv->data.status); break; case OPAL_VPID: v->type = PMIX_PROC_RANK; @@ -770,7 +770,7 @@ void pmix2x_value_load(pmix_value_t *v, } } if (!found) { - (void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.vpid); + (void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.jobid); } v->data.proc->rank = pmix2x_convert_opalrank(kv->data.name.vpid); break; @@ -925,7 +925,7 @@ int pmix2x_value_unload(opal_value_t 
*kv, break; case PMIX_STATUS: kv->type = OPAL_STATUS; - memcpy(&kv->data.status, &(v->data.status), sizeof(opal_status_t)); + kv->data.status = pmix2x_convert_rc(v->data.status); break; case PMIX_PROC_RANK: kv->type = OPAL_VPID; @@ -1185,14 +1185,7 @@ static int notify_event(int status, n=0; OPAL_LIST_FOREACH(kv, info, opal_value_t) { (void)strncpy(op->info[n].key, kv->key, PMIX_MAX_KEYLEN); - /* little dicey here as we need to convert a status, if - * provided, and it will be an int coming down to us */ - if (0 == strcmp(kv->key, OPAL_PMIX_JOB_TERM_STATUS)) { - op->info[n].value.type = PMIX_STATUS; - op->info[n].value.data.status = pmix2x_convert_opalrc(kv->data.integer); - } else { - pmix2x_value_load(&op->info[n].value, kv); - } + pmix2x_value_load(&op->info[n].value, kv); ++n; } } diff --git a/opal/mca/pmix/pmix2x/pmix2x_client.c b/opal/mca/pmix/pmix2x/pmix2x_client.c index a90e056889..7b8c897d05 100644 --- a/opal/mca/pmix/pmix2x/pmix2x_client.c +++ b/opal/mca/pmix/pmix2x/pmix2x_client.c @@ -226,6 +226,9 @@ int pmix2x_tool_init(opal_list_t *info) pinfo = NULL; ninfo = 0; } + /* we are going to get our name from the server, or we were given it by the tool, + * so mark as native launch so we don't convert back/forth */ + mca_pmix_pmix2x_component.native_launch = true; OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); rc = PMIx_tool_init(&my_proc, pinfo, ninfo); @@ -245,20 +248,10 @@ int pmix2x_tool_init(opal_list_t *info) return OPAL_SUCCESS; } - if (OPAL_JOBID_INVALID == pname.jobid) { - /* store our jobid and rank */ - if (NULL != getenv(OPAL_MCA_PREFIX"orte_launch")) { - /* if we were launched by the OMPI RTE, then - * the jobid is in a special format - so get it */ - mca_pmix_pmix2x_component.native_launch = true; - opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace); - } else { - /* we were launched by someone else, so make the - * jobid just be the hash of the nspace */ - OPAL_HASH_JOBID(my_proc.nspace, pname.jobid); - } - pname.vpid = 
pmix2x_convert_rank(my_proc.rank); - } + /* store our jobid and rank */ + opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace); + pname.vpid = pmix2x_convert_rank(my_proc.rank); + /* insert this into our list of jobids - it will be the * first, and so we'll check it first */ job = OBJ_NEW(opal_pmix2x_jobid_trkr_t); @@ -1154,6 +1147,9 @@ int pmix2x_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid) if (NULL != app->env) { papps[n].env = opal_argv_copy(app->env); } + if (NULL != app->cwd) { + papps[n].cwd = strdup(app->cwd); + } papps[n].maxprocs = app->maxprocs; if (0 < (papps[n].ninfo = opal_list_get_size(&app->info))) { PMIX_INFO_CREATE(papps[n].info, papps[n].ninfo); diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index fecf3b566b..03ffa7118d 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -93,7 +93,7 @@ static int opal_err2str(int errnum, const char **errmsg) { const char *retval; - +opal_output(0, "OPAL ERR2STR %d", errnum); switch (errnum) { case OPAL_SUCCESS: retval = "Success"; diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 1109c360e2..824765adfe 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -111,7 +111,7 @@ static int rte_init(void) if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ - if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto fn_fail; diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index de3734b0ed..5c8cd7b9da 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -65,7 +65,7 @@ ORTE_DECLSPEC int orte_ess_base_app_setup(bool db_restrict_local); ORTE_DECLSPEC int orte_ess_base_app_finalize(void); ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report); -ORTE_DECLSPEC int 
orte_ess_base_tool_setup(void); +ORTE_DECLSPEC int orte_ess_base_tool_setup(uint8_t flags); ORTE_DECLSPEC int orte_ess_base_tool_finalize(void); ORTE_DECLSPEC int orte_ess_base_orted_setup(void); diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index b911292f67..db3cc97632 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -38,20 +38,18 @@ #include "opal/mca/pmix/base/base.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_cr.h" +#include "opal/runtime/opal_progress_threads.h" #include "opal/util/arch.h" #include "opal/util/proc.h" #include "orte/mca/oob/base/base.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/rml/base/base.h" +#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/iof/base/base.h" #include "orte/mca/state/base/base.h" -#if OPAL_ENABLE_FT_CR == 1 -#include "orte/mca/snapc/base/base.h" -#include "orte/mca/sstore/base/base.h" -#endif #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" @@ -63,13 +61,51 @@ #include "orte/mca/ess/base/base.h" -int orte_ess_base_tool_setup(void) +static void infocb(int status, + opal_list_t *info, + void *cbdata, + opal_pmix_release_cbfunc_t release_fn, + void *release_cbdata) +{ + opal_value_t *kv; + opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata; + + if (OPAL_SUCCESS != status) { + ORTE_ERROR_LOG(status); + } else { + kv = (opal_value_t*)opal_list_get_first(info); + if (NULL == kv) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + } else { + if (0 == strcmp(kv->key, OPAL_PMIX_SERVER_URI)) { + orte_process_info.my_hnp_uri = strdup(kv->data.string); + } else { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + } + } + } + if (NULL != release_fn) { + release_fn(release_cbdata); + } + OPAL_PMIX_WAKEUP_THREAD(lock); +} + +int orte_ess_base_tool_setup(uint8_t flags) { int ret; char *error = 
NULL; opal_list_t transports; orte_jobid_t jobid; orte_vpid_t vpid; + opal_list_t info; + opal_value_t *kv, val; + opal_pmix_query_t *q; + opal_pmix_lock_t lock; + opal_buffer_t *buf; + + /* we need an external progress thread to ensure that things run + * async with the PMIx code */ + orte_event_base = opal_progress_thread_init("tool"); /* setup the PMIx framework - ensure it skips all non-PMIx components, * but do not override anything we were given */ @@ -84,7 +120,13 @@ int orte_ess_base_tool_setup(void) error = "opal_pmix_base_select"; goto error; } - /* set the event base */ + if (NULL == opal_pmix.tool_init) { + /* we no longer support non-pmix tools */ + error = "opal_pmix.tool_init"; + ret = ORTE_ERR_NOT_SUPPORTED; + goto error; + } + /* set the event base for the pmix component code */ opal_pmix_base_set_evbase(orte_event_base); /* we have to define our name here */ @@ -126,44 +168,68 @@ int orte_ess_base_tool_setup(void) /* initialize - PMIx may set our name here if we attach to * a PMIx server */ - if (NULL != opal_pmix.tool_init) { - opal_list_t info; - opal_value_t *kv; - OBJ_CONSTRUCT(&info, opal_list_t); - /* pass our name so the PMIx layer can use it */ + OBJ_CONSTRUCT(&info, opal_list_t); + /* pass our name so the PMIx layer can use it */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_TOOL_NSPACE); + orte_util_convert_jobid_to_string(&kv->data.string, ORTE_PROC_MY_NAME->jobid); + kv->type = OPAL_STRING; + opal_list_append(&info, &kv->super); + /* ditto for our rank */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_TOOL_RANK); + kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid; + kv->type = OPAL_VPID; + opal_list_append(&info, &kv->super); + if (0 != flags) { + /* instruct the PMIx layer on if/how to connect */ kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_TOOL_NSPACE); - orte_util_convert_jobid_to_string(&kv->data.string, ORTE_PROC_MY_NAME->jobid); - kv->type = OPAL_STRING; - opal_list_append(&info, &kv->super); - /* 
ditto for our rank */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_TOOL_RANK); - kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid; - kv->type = OPAL_VPID; - opal_list_append(&info, &kv->super); - /* ORTE tools don't need to connect to a PMIx server as - * they will connect via the OOB */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT); + if (0x01 == flags) { + kv->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT); + } else if (0x02 == flags) { + kv->key = strdup(OPAL_PMIX_CONNECT_SYSTEM_FIRST); + } else if (0x04 == flags) { + kv->key = strdup(OPAL_PMIX_CONNECT_TO_SYSTEM); + } else { + opal_output(0, "UNKNOWN CONNECTION FLAG %0x", flags); + error = "unknown connection flags"; + ret = ORTE_ERR_BAD_PARAM; + OPAL_LIST_DESTRUCT(&info); + OBJ_RELEASE(kv); + goto error; + } kv->data.flag = true; kv->type = OPAL_BOOL; opal_list_append(&info, &kv->super); - if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) { - ORTE_ERROR_LOG(ret); - error = "opal_pmix.init"; - OPAL_LIST_DESTRUCT(&info); - goto error; - } - OPAL_LIST_DESTRUCT(&info); - ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; - ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; } + if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) { + ORTE_ERROR_LOG(ret); + error = "opal_pmix.init"; + OPAL_LIST_DESTRUCT(&info); + goto error; + } + OPAL_LIST_DESTRUCT(&info); + ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; + ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; + orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename); orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL; orte_process_info.super.proc_arch = opal_local_arch; opal_proc_local_set(&orte_process_info.super); + if (NULL != opal_pmix.query) { + /* query the server for its URI so we can get any IO forwarded to us */ + OBJ_CONSTRUCT(&info, opal_list_t); + q = OBJ_NEW(opal_pmix_query_t); + opal_argv_append_nosize(&q->keys, OPAL_PMIX_SERVER_URI); + opal_list_append(&info, &q->super); + 
OPAL_PMIX_CONSTRUCT_LOCK(&lock); + opal_pmix.query(&info, infocb, &lock); + OPAL_PMIX_WAIT_THREAD(&lock); + OPAL_PMIX_DESTRUCT_LOCK(&lock); + OPAL_LIST_DESTRUCT(&info); + } + /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -227,12 +293,6 @@ int orte_ess_base_tool_setup(void) orte_mgmt_conduit = orte_rml.open_conduit(&transports); OPAL_LIST_DESTRUCT(&transports); - /* since I am a tool, then all I really want to do is communicate. - * So setup communications and be done - finding the HNP - * to which I want to communicate and setting up a route for - * that link is my responsibility - */ - /* we -may- need to know the name of the head * of our session directory tree, particularly the * tmp base where any other session directories on @@ -248,7 +308,52 @@ int orte_ess_base_tool_setup(void) /* setup I/O forwarding system - must come after we init routes */ if (NULL != orte_process_info.my_hnp_uri) { - /* only do this if we were given an HNP */ + /* extract the name */ + if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + exit(1); + } + /* Set the contact info in the RML - this won't actually establish + * the connection, but just tells the RML how to reach the HNP + * if/when we attempt to send to it + */ + OBJ_CONSTRUCT(&val, opal_value_t); + val.key = OPAL_PMIX_PROC_URI; + val.type = OPAL_STRING; + val.data.string = orte_process_info.my_hnp_uri; + if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_HNP, &val))) { + ORTE_ERROR_LOG(ret); + val.key = NULL; + val.data.string = NULL; + OBJ_DESTRUCT(&val); + error = "store HNP URI"; + goto error; + } + val.key = NULL; + val.data.string = NULL; + OBJ_DESTRUCT(&val); + /* set the route to be direct */ + if (ORTE_SUCCESS != orte_routed.update_route(NULL, 
ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + orte_finalize(); + exit(1); + } + + /* connect to the HNP so we can recv forwarded output */ + buf = OBJ_NEW(opal_buffer_t); + ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, + buf, ORTE_RML_TAG_WARMUP_CONNECTION, + orte_rml_send_callback, NULL); + if (ORTE_SUCCESS != ret) { + ORTE_ERROR_LOG(ret); + error = "warmup connection"; + goto error; + } + + /* set the target hnp as our lifeline so we will terminate if it exits */ + orte_routed.set_lifeline(NULL, ORTE_PROC_MY_HNP); + + /* setup the IOF */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; @@ -259,46 +364,8 @@ int orte_ess_base_tool_setup(void) error = "orte_iof_base_select"; goto error; } - /* if we were given an HNP, then also setup the PLM in case this - * tool wants to request that we spawn something for it */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_plm_base_open"; - goto error; - } - /* we don't select the plm framework as we only want the - * base proxy functions */ - } -#if OPAL_ENABLE_FT_CR == 1 - /* - * Setup the SnapC - */ - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_snapc_base_open"; - goto error; } - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) { - ORTE_ERROR_LOG(ret); - error = "orte_sstore_base_open"; - goto error; - } - - if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) { - ORTE_ERROR_LOG(ret); - error = "orte_snapc_base_select"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_sstore_base_select"; - goto error; - } - - /* Tools do not need all the 
OPAL CR stuff */ - opal_cr_set_enabled(false); -#endif return ORTE_SUCCESS; @@ -314,11 +381,6 @@ int orte_ess_base_tool_finalize(void) { orte_wait_finalize(); -#if OPAL_ENABLE_FT_CR == 1 - mca_base_framework_close(&orte_snapc_base_framework); - mca_base_framework_close(&orte_sstore_base_framework); -#endif - orte_rml.close_conduit(orte_mgmt_conduit); /* if I am a tool, then all I will have done is diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index cb200e4df3..20c2d1270a 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -92,7 +92,7 @@ static int rte_init(void) if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ - if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 59f23099b0..4bc318904e 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -87,7 +87,7 @@ static int rte_init(void) if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ - if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; diff --git a/orte/mca/ess/tm/ess_tm_module.c b/orte/mca/ess/tm/ess_tm_module.c index b9fe8e0cbe..02e391ea00 100644 --- a/orte/mca/ess/tm/ess_tm_module.c +++ b/orte/mca/ess/tm/ess_tm_module.c @@ -91,7 +91,7 @@ static int rte_init(void) if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ - if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; diff --git a/orte/mca/ess/tool/ess_tool.h 
b/orte/mca/ess/tool/ess_tool.h index e2eacc12c5..5e4ed06f28 100644 --- a/orte/mca/ess/tool/ess_tool.h +++ b/orte/mca/ess/tool/ess_tool.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -32,6 +32,9 @@ int orte_ess_tool_component_query(mca_base_module_t **module, int *priority); typedef struct { orte_ess_base_component_t super; bool async; + bool system_server_first; + bool system_server_only; + bool do_not_connect; } orte_ess_tool_component_t; ORTE_MODULE_DECLSPEC extern orte_ess_tool_component_t mca_ess_tool_component; diff --git a/orte/mca/ess/tool/ess_tool_component.c b/orte/mca/ess/tool/ess_tool_component.c index 8836464ba2..4a316bb98e 100644 --- a/orte/mca/ess/tool/ess_tool_component.c +++ b/orte/mca/ess/tool/ess_tool_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -76,6 +76,30 @@ static int tool_component_register(void) OPAL_INFO_LVL_2, MCA_BASE_VAR_SCOPE_READONLY, &mca_ess_tool_component.async); + + mca_ess_tool_component.do_not_connect = false; + (void) mca_base_component_var_register (c, "do_not_connect", + "Do not connect to a PMIx server", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_ess_tool_component.do_not_connect); + + mca_ess_tool_component.system_server_first = false; + (void) mca_base_component_var_register (c, "system_server_first", + "Look for a system PMIx server first", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_ess_tool_component.system_server_first); + + mca_ess_tool_component.system_server_only = false; + (void) mca_base_component_var_register (c, "system_server_only", + "Only connect to a system server (and not an mpirun)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_ess_tool_component.system_server_only); return ORTE_SUCCESS; } @@ -111,4 +135,3 @@ orte_ess_tool_component_close(void) { return ORTE_SUCCESS; } - diff --git a/orte/mca/ess/tool/ess_tool_module.c b/orte/mca/ess/tool/ess_tool_module.c index 319a80acff..d190b11dc9 100644 --- a/orte/mca/ess/tool/ess_tool_module.c +++ b/orte/mca/ess/tool/ess_tool_module.c @@ -63,6 +63,7 @@ static int rte_init(void) { int ret; char *error = NULL; + uint8_t flags; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -79,8 +80,18 @@ static int rte_init(void) progress_thread_running = true; } + /* setup the tool connection flags */ + flags = 0; + if (mca_ess_tool_component.do_not_connect) { + flags = 0x01; + } else if (mca_ess_tool_component.system_server_first) { + flags = 0x02; + } else if (mca_ess_tool_component.system_server_only) { + flags = 0x04; + } + /* do the standard tool init */ - if (ORTE_SUCCESS != (ret = 
orte_ess_base_tool_setup())) { + if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(flags))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; diff --git a/orte/mca/iof/iof.h b/orte/mca/iof/iof.h index 56e4775bdf..4d00a7186a 100644 --- a/orte/mca/iof/iof.h +++ b/orte/mca/iof/iof.h @@ -133,29 +133,29 @@ BEGIN_C_DECLS * behalf of a tool that had the HNP spawn a job. First * argument is the orte_job_t of the spawned job, second * is a pointer to the name of the requesting tool */ -#define ORTE_IOF_PROXY_PULL(a, b) \ - do { \ - opal_buffer_t *buf; \ - orte_iof_tag_t tag; \ - orte_process_name_t nm; \ - \ - buf = OBJ_NEW(opal_buffer_t); \ - \ - /* setup the tag to pull from HNP */ \ - tag = ORTE_IOF_STDOUTALL | ORTE_IOF_PULL; \ - opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG); \ - /* pack the name of the source we want to pull */ \ - nm.jobid = (a)->jobid; \ - nm.vpid = ORTE_VPID_WILDCARD; \ - opal_dss.pack(buf, &nm, 1, ORTE_NAME); \ - /* pack the name of the tool */ \ - opal_dss.pack(buf, (b), 1, ORTE_NAME); \ - \ - /* send the buffer to the HNP */ \ - orte_rml.send_buffer_nb(orte_mgmt_conduit, \ - ORTE_PROC_MY_HNP, buf, \ - ORTE_RML_TAG_IOF_HNP, \ - orte_rml_send_callback, NULL); \ +#define ORTE_IOF_PROXY_PULL(a, b) \ + do { \ + opal_buffer_t *buf; \ + orte_iof_tag_t tag; \ + orte_process_name_t nm; \ + \ + buf = OBJ_NEW(opal_buffer_t); \ + \ + /* setup the tag to pull from HNP */ \ + tag = ORTE_IOF_STDOUTALL | ORTE_IOF_PULL | ORTE_IOF_EXCLUSIVE; \ + opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG); \ + /* pack the name of the source we want to pull */ \ + nm.jobid = (a)->jobid; \ + nm.vpid = ORTE_VPID_WILDCARD; \ + opal_dss.pack(buf, &nm, 1, ORTE_NAME); \ + /* pack the name of the tool */ \ + opal_dss.pack(buf, (b), 1, ORTE_NAME); \ + \ + /* send the buffer to the HNP */ \ + orte_rml.send_buffer_nb(orte_mgmt_conduit, \ + ORTE_PROC_MY_HNP, buf, \ + ORTE_RML_TAG_IOF_HNP, \ + orte_rml_send_callback, NULL); \ } while(0); /* Initialize the selected module */ 
diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index eff1ad8a60..fbed5caa29 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -387,6 +387,7 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata) orte_vpid_t *vptr; int i, rc; char *serial_number; + orte_process_name_t requestor, *rptr; ORTE_ACQUIRE_OBJECT(caddy); @@ -425,7 +426,12 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata) * indicating that request */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL, NULL, OPAL_BOOL)) { /* send a message to our IOF containing the requested pull */ - ORTE_IOF_PROXY_PULL(jdata, &jdata->originator); + rptr = &requestor; + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&rptr, OPAL_NAME)) { + ORTE_IOF_PROXY_PULL(jdata, rptr); + } else { + ORTE_IOF_PROXY_PULL(jdata, &jdata->originator); + } /* the tool will PUSH its stdin, so nothing we need to do here * about stdin */ } diff --git a/orte/orted/pmix/pmix_server_dyn.c b/orte/orted/pmix/pmix_server_dyn.c index 16680dc19b..0e29e23ef8 100644 --- a/orte/orted/pmix/pmix_server_dyn.c +++ b/orte/orted/pmix/pmix_server_dyn.c @@ -240,6 +240,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, } else if (0 == strcmp(info->key, OPAL_PMIX_REQUESTOR_IS_TOOL)) { orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + /* request that IO be forwarded to the requesting tool */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL, + ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); } else if (0 == strcmp(info->key, OPAL_PMIX_STDIN_TGT)) { if (0 == strcmp(info->data.string, "all")) { jdata->stdin_target = ORTE_VPID_WILDCARD; diff --git a/orte/orted/pmix/pmix_server_gen.c b/orte/orted/pmix/pmix_server_gen.c index d91e2aa88f..94e2cd8a0f 100644 --- a/orte/orted/pmix/pmix_server_gen.c +++ 
b/orte/orted/pmix/pmix_server_gen.c @@ -676,6 +676,13 @@ static void _query(int sd, short args, void *cbdata) opal_list_append(results, &kv->super); } #endif + } else if (0 == strcmp(q->keys[n], OPAL_PMIX_SERVER_URI)) { + /* they want our URI */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_SERVER_URI); + kv->type = OPAL_STRING; + kv->data.string = strdup(orte_process_info.my_hnp_uri); + opal_list_append(results, &kv->super); } } } diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index 8537277b88..ce477ff9aa 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -53,9 +53,7 @@ DIST_SUBDIRS += \ tools/prun if OPAL_WANT_PRUN -if WANT_INSTALL_HEADERS SUBDIRS += \ tools/prun \ tools/orte-dvm endif -endif diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c index 08dc2319d6..745c5f97eb 100644 --- a/orte/tools/orte-dvm/orte-dvm.c +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -502,6 +502,14 @@ static void notify_requestor(int sd, short args, void *cbdata) if (notify) { info = OBJ_NEW(opal_list_t); + /* ensure this only goes to the job terminated event handler */ + val = OBJ_NEW(opal_value_t); + val->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT); + val->type = OPAL_BOOL; + val->data.flag = true; + opal_list_append(info, &val->super); + /* tell the server not to cache the event as subsequent jobs + * do not need to know about it */ val = OBJ_NEW(opal_value_t); val->key = strdup(OPAL_PMIX_EVENT_DO_NOT_CACHE); val->type = OPAL_BOOL; @@ -510,15 +518,20 @@ static void notify_requestor(int sd, short args, void *cbdata) /* provide the status */ val = OBJ_NEW(opal_value_t); val->key = strdup(OPAL_PMIX_JOB_TERM_STATUS); - val->type = OPAL_INT; - val->data.integer = ret; + val->type = OPAL_STATUS; + val->data.status = ret; opal_list_append(info, &val->super); /* if there was a problem, we need to send the requestor more info about what happened */ if (0 < ret) { val = OBJ_NEW(opal_value_t); val->key = strdup(OPAL_PMIX_PROCID); 
val->type = OPAL_NAME; - val->data.name = pptr->name; + val->data.name.jobid = jdata->jobid; + if (NULL != pptr) { + val->data.name.vpid = pptr->name.vpid; + } else { + val->data.name.vpid = ORTE_VPID_WILDCARD; + } opal_list_append(info, &val->super); } opal_pmix.notify_event(OPAL_ERR_JOB_TERMINATED, NULL, diff --git a/orte/tools/prun/Makefile.am b/orte/tools/prun/Makefile.am index 7854076cc9..165a1da07d 100644 --- a/orte/tools/prun/Makefile.am +++ b/orte/tools/prun/Makefile.am @@ -26,10 +26,7 @@ # post-processed forms of the CFLAGS in the library targets down # below. -AM_CPPFLAGS = $(OPAL_PMIX_CPPFLAGS) - CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS) -AM_LDFLAGS = $(OPAL_PMIX_LDFLAGS) include $(top_srcdir)/Makefile.ompi-rules @@ -56,10 +53,7 @@ prun_SOURCES = \ prun_LDADD = \ $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \ - $(OPAL_PMIX_LDADD) - -prun_LIBS = $(OPAL_PMIX_LIBS) + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la distclean-local: rm -f $(man_pages) diff --git a/orte/tools/prun/prun.c b/orte/tools/prun/prun.c index 38f5555dcd..a96ad46e5c 100644 --- a/orte/tools/prun/prun.c +++ b/orte/tools/prun/prun.c @@ -69,22 +69,23 @@ #include "opal/util/show_help.h" #include "opal/util/fd.h" #include "opal/sys/atomic.h" -#if OPAL_ENABLE_FT_CR == 1 -#include "opal/runtime/opal_cr.h" -#endif #include "opal/version.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_info_support.h" +#include "opal/runtime/opal_progress_threads.h" #include "opal/util/os_path.h" #include "opal/util/path.h" #include "opal/class/opal_pointer_array.h" #include "opal/dss/dss.h" +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/state/state.h" + /* ensure I can behave like a daemon */ #include "prun.h" -#include -#include /** * Global struct for caching orte command line options. 
@@ -143,7 +144,7 @@ typedef struct orte_cmd_options_t orte_cmd_options_t; static orte_cmd_options_t orte_cmd_options = {0}; static opal_cmd_line_t *orte_cmd_line = NULL; static opal_list_t job_info; -static opal_pmix_lock_t globallock; +static volatile bool active = false; static int create_app(int argc, char* argv[], opal_list_t *jdata, @@ -476,10 +477,10 @@ static opal_cmd_line_init_t cmd_line_init[] = { }; -static void infocb(pmix_status_t status, - pmix_info_t *info, size_t ninfo, +static void infocb(int status, + opal_list_t *info, void *cbdata, - pmix_release_cbfunc_t release_fn, + opal_pmix_release_cbfunc_t release_fn, void *release_cbdata) { opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata; @@ -491,35 +492,42 @@ static void infocb(pmix_status_t status, OPAL_PMIX_WAKEUP_THREAD(lock); } -static void regcbfunc(pmix_status_t status, size_t ref, void *cbdata) +static void regcbfunc(int status, size_t ref, void *cbdata) { opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata; OPAL_ACQUIRE_OBJECT(lock); OPAL_PMIX_WAKEUP_THREAD(lock); } -static void evhandler(size_t evhdlr_registration_id, - pmix_status_t status, - const pmix_proc_t *source, - pmix_info_t info[], size_t ninfo, - pmix_info_t *results, size_t nresults, - pmix_event_notification_cbfunc_fn_t cbfunc, +static void release(int sd, short args, void *cbdata) +{ + active = false; +} + +static bool fired = false; +static void evhandler(int status, + const opal_process_name_t *source, + opal_list_t *info, opal_list_t *results, + opal_pmix_notification_complete_fn_t cbfunc, void *cbdata) { - size_t n; + opal_value_t *val; if (NULL != info) { - for (n=0; n < ninfo; n++) { - if (0 == strncmp(info[n].key, PMIX_JOB_TERM_STATUS, PMIX_MAX_KEYLEN)) { - opal_output(0, "JOB COMPLETED WITH STATUS %s", PMIx_Error_string(info[n].value.data.status)); + OPAL_LIST_FOREACH(val, info, opal_value_t) { + if (0 == strcmp(val->key, OPAL_PMIX_JOB_TERM_STATUS)) { + opal_output(0, "JOB COMPLETED WITH STATUS %d", + val->data.integer); } 
} } if (NULL != cbfunc) { - cbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cbdata); + cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata); + } + if (!fired) { + fired = true; + ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_TERMINATED); } - OPAL_ACQUIRE_OBJECT(&globallock); - OPAL_PMIX_WAKEUP_THREAD(&globallock); } @@ -530,14 +538,9 @@ int prun(int argc, char *argv[]) opal_pmix_lock_t lock; opal_list_t apps; opal_value_t *val; - opal_pmix_app_t *app; - pmix_status_t code; - char nspace[PMIX_MAX_NSLEN+1]; - pmix_info_t info; - pmix_proc_t myproc; - size_t asz, jsz; - pmix_app_t *papps = NULL; - pmix_info_t *pinfo = NULL; + opal_list_t info; + opal_jobid_t jobid; + struct timespec tp = {0, 100000}; /* init the globals */ memset(&orte_cmd_options, 0, sizeof(orte_cmd_options)); @@ -644,106 +647,85 @@ int prun(int argc, char *argv[]) return rc; } - /* use the system connection first, if available */ - PMIX_INFO_LOAD(&info, OPAL_PMIX_CONNECT_SYSTEM_FIRST, NULL, PMIX_BOOL); - /* init as a tool */ - if (OPAL_SUCCESS != PMIx_tool_init(&myproc, &info, 1)) { - fprintf(stderr, "Unable to init as tool\n"); - exit(1); + /* tell the ess/tool component that we want to connect to a system-level + * PMIx server */ + opal_setenv("OMPI_MCA_ess_tool_system_server_only", "1", true, &environ); + + /* now initialize ORTE */ + if (OPAL_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { + OPAL_ERROR_LOG(rc); + return rc; } - PMIX_INFO_DESTRUCT(&info); /* if the user just wants us to terminate a DVM, then do so */ if (orte_cmd_options.terminate_dvm) { - PMIX_INFO_LOAD(&info, OPAL_PMIX_JOB_CTRL_TERMINATE, NULL, PMIX_BOOL); + OBJ_CONSTRUCT(&info, opal_list_t); + val = OBJ_NEW(opal_value_t); + val->key = strdup(OPAL_PMIX_JOB_CTRL_TERMINATE); + val->type = OPAL_BOOL; + val->data.flag = true; + opal_list_append(&info, &val->super); + fprintf(stderr, "TERMINATING DVM..."); OPAL_PMIX_CONSTRUCT_LOCK(&lock); - rc = PMIx_Job_control_nb(NULL, 0, &info, 1, infocb, (void*)&lock); + rc = 
opal_pmix.job_control(NULL, &info, infocb, (void*)&lock); OPAL_PMIX_WAIT_THREAD(&lock); OPAL_PMIX_DESTRUCT_LOCK(&lock); - PMIX_INFO_DESTRUCT(&info); + OPAL_LIST_DESTRUCT(&info); fprintf(stderr, "DONE\n"); goto DONE; } + orte_state.add_proc_state(ORTE_PROC_STATE_TERMINATED, release, ORTE_SYS_PRI); + /* get here if they want to run an application, so let's parse * the cmd line to get it */ - if (OPAL_SUCCESS != parse_locals(&apps, argc, argv)) { - opal_output(0, "[%s:%d] SOMETHING WRONG", __FILE__, __LINE__); + if (OPAL_SUCCESS != (rc = parse_locals(&apps, argc, argv))) { + OPAL_ERROR_LOG(rc); OPAL_LIST_DESTRUCT(&apps); goto DONE; } /* bozo check */ - if (0 == (asz = opal_list_get_size(&apps))) { - opal_output(0, "[%s:%d] SOMETHING WRONG", __FILE__, __LINE__); + if (0 == opal_list_get_size(&apps)) { + opal_output(0, "No application specified!"); goto DONE; } + /* init flag */ + active = true; + /* register for job terminations so we get notified when * our job completes */ OPAL_PMIX_CONSTRUCT_LOCK(&lock); - code = PMIX_ERR_JOB_TERMINATED; - PMIx_Register_event_handler(&code, 1, NULL, 0, evhandler, regcbfunc, &lock); + OBJ_CONSTRUCT(&info, opal_list_t); + val = OBJ_NEW(opal_value_t); + val->key = strdup("foo"); + val->type = OPAL_INT; + val->data.integer = OPAL_ERR_JOB_TERMINATED; + opal_list_append(&info, &val->super); + opal_pmix.register_evhandler(&info, NULL, evhandler, regcbfunc, &lock); OPAL_PMIX_WAIT_THREAD(&lock); OPAL_PMIX_DESTRUCT_LOCK(&lock); + OPAL_LIST_DESTRUCT(&info); - /* convert the job info and apps to PMIx arrays */ - if (0 < (jsz = opal_list_get_size(&job_info))) { - PMIX_INFO_CREATE(pinfo, jsz); - i=0; - OPAL_LIST_FOREACH(val, &job_info, opal_value_t) { - (void)strncpy(pinfo[i].key, val->key, PMIX_MAX_KEYLEN); - /* we only have bool and string types here */ - if (OPAL_BOOL == val->type) { - pinfo[i].value.type = PMIX_BOOL; - pinfo[i].value.data.flag = val->data.flag; - } else if (OPAL_STRING == val->type) { - pinfo[i].value.type = PMIX_STRING; - 
pinfo[i].value.data.string = strdup(val->data.string); - } else { - opal_output(0, "UNSUPPORTED TYPE %d", val->type); - } - ++i; - } - } - OPAL_LIST_DESTRUCT(&job_info); - - PMIX_APP_CREATE(papps, asz); - i=0; - OPAL_LIST_FOREACH(app, &apps, opal_pmix_app_t) { - papps[i].cmd = strdup(app->cmd); - papps[i].argv = opal_argv_copy(app->argv); - papps[i].env = opal_argv_copy(app->env); - if (NULL != app->cwd) { - papps[i].cwd = strdup(app->cwd); - } - papps[i].maxprocs = app->maxprocs; - ++i; - } - OPAL_LIST_DESTRUCT(&apps); - - OPAL_PMIX_CONSTRUCT_LOCK(&globallock); - if (PMIX_SUCCESS != PMIx_Spawn(pinfo, jsz, papps, asz, nspace)) { - opal_output(0, "[%s:%d] SOMETHING WRONG", __FILE__, __LINE__); - OPAL_PMIX_DESTRUCT_LOCK(&globallock); + if (OPAL_SUCCESS != (rc = opal_pmix.spawn(&job_info, &apps, &jobid))) { + opal_output(0, "Job failed to spawn: %s", opal_strerror(rc)); goto DONE; } - opal_output(0, "JOB %s EXECUTING", nspace); - OPAL_PMIX_WAIT_THREAD(&globallock); - OPAL_PMIX_DESTRUCT_LOCK(&globallock); - if (NULL != pinfo) { - PMIX_INFO_FREE(pinfo, jsz); - } - if (NULL != papps) { - PMIX_APP_FREE(papps, asz); + OPAL_LIST_DESTRUCT(&job_info); + OPAL_LIST_DESTRUCT(&apps); + + opal_output(0, "JOB %s EXECUTING", OPAL_JOBID_PRINT(jobid)); + + while (active) { + nanosleep(&tp, NULL); } - DONE: + DONE: /* cleanup and leave */ - PMIx_tool_finalize(); - opal_finalize(); + orte_finalize(); return 0; }