1
1

Complete the fix of the ORTE DVM. We will now use "prun" instead of "orterun -hnp foo" to execute jobs. This provides the feature of automatic discovery of the orte-dvm so you don't need to manually enter URIs or contact file locations. All IO is forwarded to prun.

Still in the "needs to be done" category:

* mapping/ranking/binding options aren't correctly supported

* if the DVM encounters some errors (e.g., not enough resources for the job), the resulting error is globally set and impacts any subsequent job submission

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-09-15 06:50:58 -07:00
родитель bffcc3bca0
Коммит 3c914a7a97
36 изменённых файлов: 436 добавлений и 368 удалений

Просмотреть файл

@ -47,10 +47,8 @@ install-exec-hook:
(cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT))
(cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT))
if OPAL_WANT_PRUN
if WANT_INSTALL_HEADERS
(cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT))
endif
endif
uninstall-local:
rm -f $(DESTDIR)$(bindir)/mpirun$(EXEEXT) \
@ -60,9 +58,7 @@ uninstall-local:
$(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \
$(DESTDIR)$(bindir)/ompi-server$(EXEEXT)
if OPAL_WANT_PRUN
if WANT_INSTALL_HEADERS
m -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT)
endif
rm -f $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT)
endif
endif # OPAL_INSTALL_BINARIES
@ -119,11 +115,9 @@ ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1
cp -f $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-server.1
if OPAL_WANT_PRUN
if WANT_INSTALL_HEADERS
ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1
cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1
endif
endif
clean-local:
rm -f $(man_pages)

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -837,7 +837,7 @@ int opal_dss_pack_value(opal_buffer_t *buffer, const void *src,
}
break;
default:
opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE %d", (int)ptr[i]->type);
opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE %d FOR KEY %s", (int)ptr[i]->type, ptr[i]->key);
return OPAL_ERROR;
}
}
@ -981,4 +981,3 @@ int opal_dss_pack_status(opal_buffer_t *buffer, const void *src,
return ret;
}

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -1086,13 +1086,21 @@ int opal_dss_unpack_value(opal_buffer_t *buffer, void *dest,
return ret;
}
break;
case OPAL_PTR:
/* just ignore these values */
break;
case OPAL_NAME:
if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.name, &m, OPAL_NAME))) {
return ret;
}
break;
case OPAL_STATUS:
if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.status, &m, OPAL_INT))) {
return ret;
}
break;
default:
opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE");
opal_output(0, "UNPACK-OPAL-VALUE: UNSUPPORTED TYPE %d FOR KEY %s", (int)ptr[i]->type, ptr[i]->key);
return OPAL_ERROR;
}
}

Просмотреть файл

@ -84,6 +84,8 @@ static int opal_pmix_base_frame_open(mca_base_open_flag_t flags)
rc = mca_base_framework_components_open(&opal_pmix_base_framework, flags);
/* ensure the function pointers are NULL */
memset(&opal_pmix, 0, sizeof(opal_pmix));
/* default to the OPAL event base */
opal_pmix_base.evbase = opal_sync_event_base;
/* pass across the verbosity */
opal_pmix_verbose_output = opal_pmix_base_framework.framework_output;
return rc;

Просмотреть файл

@ -48,16 +48,7 @@ AC_DEFUN([MCA_opal_pmix_ext2x_CONFIG],[
[$1
# need to set the wrapper flags for static builds
pmix_ext2x_WRAPPER_EXTRA_LDFLAGS=$opal_external_pmix_LDFLAGS
pmix_ext2x_WRAPPER_EXTRA_LIBS=$opal_external_pmix_LIBS
# and the flags for prun
OPAL_PMIX_CPPFLAGS="-I$opal_external_pmix_CPPFLAGS"
AC_SUBST(OPAL_PMIX_CPPFLAGS)
OPAL_PMIX_LDFLAGS=$opal_external_pmix_LDFLAGS
AC_SUBST(OPAL_PMIX_LDFLAGS)
OPAL_PMIX_LDADD=
AC_SUBST(OPAL_PMIX_LDADD)
OPAL_PMIX_LIBS=-lpmix
AC_SUBST(OPAL_PMIX_LIBS)],
pmix_ext2x_WRAPPER_EXTRA_LIBS=$opal_external_pmix_LIBS],
[$2])],
[$2])

Просмотреть файл

@ -852,6 +852,21 @@ typedef void (*opal_pmix_base_module_query_fn_t)(opal_list_t *queries,
typedef void (*opal_pmix_base_log_fn_t)(opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
/* allocation */
typedef int (*opal_pmix_base_alloc_fn_t)(opal_pmix_alloc_directive_t directive,
opal_list_t *info,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* job control */
typedef int (*opal_pmix_base_job_control_fn_t)(opal_list_t *targets,
opal_list_t *directives,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/* monitoring */
typedef int (*opal_pmix_base_process_monitor_fn_t)(opal_list_t *monitor,
opal_list_t *directives,
opal_pmix_info_cbfunc_t cbfunc, void *cbdata);
/*
* the standard public API data structure
*/
@ -883,6 +898,9 @@ typedef struct {
opal_pmix_base_module_resolve_nodes_fn_t resolve_nodes;
opal_pmix_base_module_query_fn_t query;
opal_pmix_base_log_fn_t log;
opal_pmix_base_alloc_fn_t allocate;
opal_pmix_base_job_control_fn_t job_control;
opal_pmix_base_process_monitor_fn_t monitor;
/* server APIs */
opal_pmix_base_module_server_init_fn_t server_init;
opal_pmix_base_module_server_finalize_fn_t server_finalize;

Просмотреть файл

@ -86,16 +86,7 @@ AC_DEFUN([MCA_opal_pmix_pmix2x_CONFIG],[
opal_pmix_pmix2x_LDFLAGS=
opal_pmix_pmix2x_LIBS="$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/src/libpmix.la"
opal_pmix_pmix2x_CPPFLAGS="-I$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/include -I$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix -I$OPAL_TOP_SRCDIR/$opal_pmix_pmix2x_basedir/pmix/include -I$OPAL_TOP_SRCDIR/$opal_pmix_pmix2x_basedir/pmix"
opal_pmix_pmix2x_DEPENDENCIES="$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/src/libpmix.la"
# and the flags for prun
OPAL_PMIX_CPPFLAGS="$opal_pmix_pmix2x_CPPFLAGS"
AC_SUBST(OPAL_PMIX_CPPFLAGS)
OPAL_PMIX_LDADD=$opal_pmix_pmix2x_LIBS
AC_SUBST(OPAL_PMIX_LDADD)
OPAL_PMIX_LIBS=
AC_SUBST(OPAL_PMIX_LIBS)
OPAL_PMIX_LDFLAGS=
AC_SUBST(OPAL_PMIX_LDFLAGS)])
opal_pmix_pmix2x_DEPENDENCIES="$OPAL_TOP_BUILDDIR/$opal_pmix_pmix2x_basedir/pmix/src/libpmix.la"])
AC_SUBST([opal_pmix_pmix2x_LIBS])
AC_SUBST([opal_pmix_pmix2x_CPPFLAGS])

Просмотреть файл

@ -30,7 +30,7 @@ greek=
# command, or with the date (if "git describe" fails) in the form of
# "date<date>".
repo_rev=gitdcf4faf
repo_rev=git2389189
# If tarball_version is not empty, it is used as the version string in
# the tarball filename, regardless of all other versions listed in
@ -44,7 +44,7 @@ tarball_version=
# The date when this release was created
date="Sep 13, 2017"
date="Sep 14, 2017"
# The shared library version of each of PMIx's public libraries.
# These versions are maintained in accordance with the "Library

Просмотреть файл

@ -9,10 +9,12 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2013 Mellanox Technologies, Inc.
# All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2015 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -61,7 +63,7 @@
# type: string (root path to install shell scripts)
%{!?shell_scripts_path: %define shell_scripts_path %{_bindir}}
# type: string (base name of the shell scripts)
%{!?shell_scripts_basename: %define shell_scripts_basename mpivars}
%{!?shell_scripts_basename: %define shell_scripts_basename pmixvars}
# Define this to 1 if you want this RPM to install a modulefile.
# type: bool (0/1)
@ -78,17 +80,6 @@
# type: string (name of modules RPM)
%{!?modules_rpm_name: %define modules_rpm_name environment-modules}
# Should we use the mpi-selector functionality?
# type: bool (0/1)
%{!?use_mpi_selector: %define use_mpi_selector 0}
# The name of the mpi-selector RPM. Can vary from system to system.
# type: string (name of mpi-selector RPM)
%{!?mpi_selector_rpm_name: %define mpi_selector_rpm_name mpi-selector}
# The location of the mpi-selector executable (can be a relative path
# name if "mpi-selector" can be found in the path)
# type: string (path to mpi-selector exectuable)
%{!?mpi_selector: %define mpi_selector mpi-selector}
# Should we build a debuginfo RPM or not?
# type: bool (0/1)
%{!?build_debuginfo_rpm: %define build_debuginfo_rpm 0}
@ -100,7 +91,7 @@
# Should we use the default "check_files" RPM step (i.e., check for
# unpackaged files)? It is discouraged to disable this, but some
# installers need it (e.g., older versions of OFED, because they
# installed lots of other stuff in the BUILD_ROOT before PMIx/SHMEM).
# installed lots of other stuff in the BUILD_ROOT before PMIx).
# type: bool (0/1)
%{!?use_check_files: %define use_check_files 1}
@ -125,7 +116,7 @@
# type: bool (0/1)
%{!?disable_auto_requires: %define disable_auto_requires 0}
# On some platforms, PMIx/SHMEM just flat-out doesn't work with
# On some platforms, PMIx just flat-out doesn't work with
# -D_FORTIFY_SOURCE (e.g., some users have reported that there are
# problems on ioa64 platforms). In this case, just turn it off
# (meaning: this specfile will strip out that flag from the
@ -152,7 +143,7 @@
%define _includedir /opt/%{name}/%{version}/include
%define _mandir /opt/%{name}/%{version}/man
# Note that the name "pmix" is hard-coded in
# opal/mca/installdirs/config for pkgdatadir; there is currently no
# src/mca/installdirs/config for pkgdatadir; there is currently no
# easy way to have PMIx change this directory name internally. So we
# just hard-code that name here as well (regardless of the value of
# %{name} or %{_name}).
@ -162,6 +153,8 @@
# bets are off. So feel free to install it anywhere in your tree. He
# suggests $prefix/doc.
%define _defaultdocdir /opt/%{name}/%{version}/doc
# Also put the modulefile in /opt.
%define modulefile_path /opt/%{name}/%{version}/share/pmixmodulefiles
%endif
%if !%{build_debuginfo_rpm}
@ -191,10 +184,6 @@
%define optflags ""
%endif
%if %{use_mpi_selector}
%define install_shell_scripts 1
%endif
#############################################################################
#
# Preamble Section
@ -212,7 +201,7 @@ Packager: %{?_packager:%{_packager}}%{!?_packager:%{_vendor}}
Vendor: %{?_vendorinfo:%{_vendorinfo}}%{!?_vendorinfo:%{_vendor}}
Distribution: %{?_distribution:%{_distribution}}%{!?_distribution:%{_vendor}}
Prefix: %{_prefix}
Provides: mpi
Provides: pmix
Provides: pmix = %{version}
BuildRoot: /var/tmp/%{name}-%{version}-%{release}-root
%if %{disable_auto_requires}
@ -221,9 +210,6 @@ AutoReq: no
%if %{install_modulefile}
Requires: %{modules_rpm_name}
%endif
%if %{use_mpi_selector}
Requires: %{mpi_selector_rpm_name}
%endif
%description
The Process Management Interface (PMI) has been used for quite some time as a
@ -340,9 +326,8 @@ fi
CFLAGS="%{?cflags:%{cflags}}%{!?cflags:$RPM_OPT_FLAGS}"
CXXFLAGS="%{?cxxflags:%{cxxflags}}%{!?cxxflags:$RPM_OPT_FLAGS}"
FFLAGS="%{?f77flags:%{f77flags}}%{!?f7flags:$RPM_OPT_FLAGS}"
FCFLAGS="%{?fcflags:%{fcflags}}%{!?fcflags:$RPM_OPT_FLAGS}"
export CFLAGS CXXFLAGS F77FLAGS FCFLAGS
export CFLAGS CXXFLAGS FCFLAGS
%configure %{configure_options}
%{__make} %{?mflags}
@ -369,14 +354,14 @@ cat <<EOF >$RPM_BUILD_ROOT/%{modulefile_path}/%{modulefile_subdir}/%{modulefile_
#%Module
# NOTE: This is an automatically-generated file! (generated by the
# PMIx/SHMEM RPM). Any changes made here will be lost a) if the RPM is
# PMIx RPM). Any changes made here will be lost a) if the RPM is
# uninstalled, or b) if the RPM is upgraded or uninstalled.
proc ModulesHelp { } {
puts stderr "This module adds PMIx/SHMEM v%{version} to various paths"
puts stderr "This module adds PMIx v%{version} to various paths"
}
module-whatis "Sets up PMIx/SHMEM v%{version} in your enviornment"
module-whatis "Sets up PMIx v%{version} in your enviornment"
prepend-path PATH "%{_prefix}/bin/"
prepend-path LD_LIBRARY_PATH %{_libdir}
@ -391,7 +376,7 @@ EOF
%{__mkdir_p} $RPM_BUILD_ROOT/%{shell_scripts_path}
cat <<EOF > $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.sh
# NOTE: This is an automatically-generated file! (generated by the
# PMIx/SHMEM RPM). Any changes made here will be lost if the RPM is
# PMIx RPM). Any changes made here will be lost if the RPM is
# uninstalled or upgraded.
# PATH
@ -412,13 +397,10 @@ if test -z "\`echo \$MANPATH | grep %{_mandir}\`"; then
export MANPATH
fi
# MPI_ROOT
MPI_ROOT=%{_prefix}
export MPI_ROOT
EOF
cat <<EOF > $RPM_BUILD_ROOT/%{shell_scripts_path}/%{shell_scripts_basename}.csh
# NOTE: This is an automatically-generated file! (generated by the
# PMIx/SHMEM RPM). Any changes made here will be lost if the RPM is
# PMIx RPM). Any changes made here will be lost if the RPM is
# uninstalled or upgraded.
# path
@ -444,8 +426,6 @@ else
setenv MANPATH %{_mandir}:
endif
# MPI_ROOT
setenv MPI_ROOT %{_prefix}
EOF
%endif
# End of shell_scripts if
@ -465,30 +445,6 @@ rm -rf $RPM_BUILD_DIR/%{name}-%{version}
test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
#############################################################################
#
# Post Install Section
#
#############################################################################
%if %{use_mpi_selector}
%post
%{mpi_selector} \
--register %{name}-%{version} \
--source-dir %{shell_scripts_path} \
--yes
%endif
#############################################################################
#
# Pre Uninstall Section
#
#############################################################################
%if %{use_mpi_selector}
%preun
%{mpi_selector} --unregister %{name}-%{version} --yes || \
/bin/true > /dev/null 2> /dev/null
%endif
#############################################################################
#
# Files Section
@ -504,13 +460,20 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
%files
%defattr(-, root, root, -)
%if %(test "%{_prefix}" = "/usr" && echo 1 || echo 0)
#%{_bindir}/*
%{_includedir}/*
%{_libdir}/*
%{_datadir}
%else
%{_prefix}
# If the sysconfdir is not under the prefix, then list it explicitly.
%if !%{sysconfdir_in_prefix}
%{_sysconfdir}
%endif
# If %{install_in_opt}, then we're installing PMIx to
# /opt/pmix/<version>. But be sure to also explicitly mention
# If the sysconfdir is not under the prefix, then list it explicitly.
#%if !%{sysconfdir_in_prefix}
#%{_sysconfdir}/*
#%endif
# If %{install_in_opt}, then we're instaling PMIx to
# /opt/pmix<version>. But be sure to also explicitly mention
# /opt/pmix so that it can be removed by RPM when everything under
# there is also removed.
%if %{install_in_opt}
@ -527,14 +490,22 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
%endif
%doc README INSTALL LICENSE
#############################################################################
#
# Changelog
#
#############################################################################
%changelog
* Fri Jun 19 2015 Ralph H. Castain <rhc@open-mpi.org>
- Port to PMIx
* Tue Sep 12 2017 Ralph Castain <rhc@open-mpi.org>
- Port to pmix
* Tue Mar 28 2017 Jeff Squyres <jsquyres@cisco.com>
- Reverting a decision from a prior changelog entry: if
install_in_opt==1, then even put the modulefile under /opt.
* Thu Nov 12 2015 Gilles Gouaillardet <gilles@rist.or.jp>
- Revamp packaging when prefix is /usr
* Tue Jan 20 2015 Bert Wesarg <bert.wesarg@tu-dresden.de>
- Remove VampirTrace wrapper from package.
@ -545,7 +516,7 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
fields in case %{name} is overridden.
* Mon Jun 24 2013 Igor Ivanov <Igor.Ivanov@itseez.com>
- Add Open SHMEM parallel programming library as part of Open MPI
- Add Open parallel programming library as part of PMIx
* Tue Dec 11 2012 Jeff Squyres <jsquyres@cisco.com>
- Re-release 1.6.0-1.6.3 SRPMs (with new SRPM Release numbers) with
@ -593,7 +564,7 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
leave_build_root kludge nastyness. W00t!
* Fri Jan 18 2008 Jeff Squyres <jsquyres@cisco.com>
- Remove the hard-coded "pmix" name from two Requires statements
- Remove the hard-coded "openmpi" name from two Requires statements
and use %{name} instead (FWIW, %{_name} caused rpmbuild to barf).
* Wed Jan 2 2008 Jeff Squyres <jsquyres@cisco.com>
@ -683,4 +654,3 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
* Wed Mar 23 2005 Mezzanine <mezzanine@kainx.org>
- Specfile auto-generated by Mezzanine

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -510,10 +510,14 @@ pmix_status_t pmix_bfrops_base_value_xfer(pmix_value_t *p,
memcpy(&p->data.status, &src->data.status, sizeof(pmix_status_t));
break;
case PMIX_PROC:
memcpy(&p->data.proc, &src->data.proc, sizeof(pmix_proc_t));
PMIX_PROC_CREATE(p->data.proc, 1);
if (NULL == p->data.proc) {
return PMIX_ERR_NOMEM;
}
memcpy(p->data.proc, src->data.proc, sizeof(pmix_proc_t));
break;
case PMIX_PROC_RANK:
memcpy(&p->data.proc, &src->data.rank, sizeof(pmix_rank_t));
memcpy(&p->data.rank, &src->data.rank, sizeof(pmix_rank_t));
break;
case PMIX_BYTE_OBJECT:
case PMIX_COMPRESSED_STRING:

Просмотреть файл

@ -3,7 +3,7 @@
# Copyright (c) 2006 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Intel, Inc. All rights reserved
# Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2016 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$

Просмотреть файл

@ -3,7 +3,7 @@
# Copyright (c) 2006 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Intel, Inc. All rights reserved
# Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2016 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# $COPYRIGHT$

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -99,6 +99,8 @@ const opal_pmix_base_module_t opal_pmix_pmix2x_module = {
.resolve_nodes = pmix2x_resolve_nodes,
.query = pmix2x_query,
.log = pmix2x_log,
.allocate = pmix2x_allocate,
.job_control = pmix2x_job_control,
/* server APIs */
.server_init = pmix2x_server_init,
.server_finalize = pmix2x_server_finalize,
@ -265,9 +267,7 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id,
} else {
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&cd->pname.jobid, source->nspace))) {
OPAL_ERROR_LOG(rc);
OBJ_RELEASE(cd);
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
return;
cd->pname.jobid = OPAL_NAME_INVALID->jobid;
}
cd->pname.vpid = pmix2x_convert_rank(source->rank);
}
@ -750,7 +750,7 @@ void pmix2x_value_load(pmix_value_t *v,
break;
case OPAL_STATUS:
v->type = PMIX_STATUS;
memcpy(&(v->data.status), &kv->data.status, sizeof(pmix_status_t));
v->data.status = pmix2x_convert_opalrc(kv->data.status);
break;
case OPAL_VPID:
v->type = PMIX_PROC_RANK;
@ -770,7 +770,7 @@ void pmix2x_value_load(pmix_value_t *v,
}
}
if (!found) {
(void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.vpid);
(void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.jobid);
}
v->data.proc->rank = pmix2x_convert_opalrank(kv->data.name.vpid);
break;
@ -925,7 +925,7 @@ int pmix2x_value_unload(opal_value_t *kv,
break;
case PMIX_STATUS:
kv->type = OPAL_STATUS;
memcpy(&kv->data.status, &(v->data.status), sizeof(opal_status_t));
kv->data.status = pmix2x_convert_rc(v->data.status);
break;
case PMIX_PROC_RANK:
kv->type = OPAL_VPID;
@ -1185,14 +1185,7 @@ static int notify_event(int status,
n=0;
OPAL_LIST_FOREACH(kv, info, opal_value_t) {
(void)strncpy(op->info[n].key, kv->key, PMIX_MAX_KEYLEN);
/* little dicey here as we need to convert a status, if
* provided, and it will be an int coming down to us */
if (0 == strcmp(kv->key, OPAL_PMIX_JOB_TERM_STATUS)) {
op->info[n].value.type = PMIX_STATUS;
op->info[n].value.data.status = pmix2x_convert_opalrc(kv->data.integer);
} else {
pmix2x_value_load(&op->info[n].value, kv);
}
++n;
}
}

Просмотреть файл

@ -226,6 +226,9 @@ int pmix2x_tool_init(opal_list_t *info)
pinfo = NULL;
ninfo = 0;
}
/* we are going to get our name from the server, or we were given it by the tool,
* so mark as native launch so we don't convert back/forth */
mca_pmix_pmix2x_component.native_launch = true;
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
rc = PMIx_tool_init(&my_proc, pinfo, ninfo);
@ -245,20 +248,10 @@ int pmix2x_tool_init(opal_list_t *info)
return OPAL_SUCCESS;
}
if (OPAL_JOBID_INVALID == pname.jobid) {
/* store our jobid and rank */
if (NULL != getenv(OPAL_MCA_PREFIX"orte_launch")) {
/* if we were launched by the OMPI RTE, then
* the jobid is in a special format - so get it */
mca_pmix_pmix2x_component.native_launch = true;
opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace);
} else {
/* we were launched by someone else, so make the
* jobid just be the hash of the nspace */
OPAL_HASH_JOBID(my_proc.nspace, pname.jobid);
}
pname.vpid = pmix2x_convert_rank(my_proc.rank);
}
/* insert this into our list of jobids - it will be the
* first, and so we'll check it first */
job = OBJ_NEW(opal_pmix2x_jobid_trkr_t);
@ -1154,6 +1147,9 @@ int pmix2x_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid)
if (NULL != app->env) {
papps[n].env = opal_argv_copy(app->env);
}
if (NULL != app->cwd) {
papps[n].cwd = strdup(app->cwd);
}
papps[n].maxprocs = app->maxprocs;
if (0 < (papps[n].ninfo = opal_list_get_size(&app->info))) {
PMIX_INFO_CREATE(papps[n].info, papps[n].ninfo);

Просмотреть файл

@ -93,7 +93,7 @@ static int
opal_err2str(int errnum, const char **errmsg)
{
const char *retval;
opal_output(0, "OPAL ERR2STR %d", errnum);
switch (errnum) {
case OPAL_SUCCESS:
retval = "Success";

Просмотреть файл

@ -111,7 +111,7 @@ static int rte_init(void)
if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_tool_setup";
goto fn_fail;

Просмотреть файл

@ -65,7 +65,7 @@ ORTE_DECLSPEC int orte_ess_base_app_setup(bool db_restrict_local);
ORTE_DECLSPEC int orte_ess_base_app_finalize(void);
ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report);
ORTE_DECLSPEC int orte_ess_base_tool_setup(void);
ORTE_DECLSPEC int orte_ess_base_tool_setup(uint8_t flags);
ORTE_DECLSPEC int orte_ess_base_tool_finalize(void);
ORTE_DECLSPEC int orte_ess_base_orted_setup(void);

Просмотреть файл

@ -38,20 +38,18 @@
#include "opal/mca/pmix/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/util/arch.h"
#include "opal/util/proc.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/rml/base/base.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/state/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/base/base.h"
#endif
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
@ -63,13 +61,51 @@
#include "orte/mca/ess/base/base.h"
int orte_ess_base_tool_setup(void)
/*
 * Completion callback handed to opal_pmix.query() when the tool asks the
 * server for its URI.
 *
 * status          result of the query; anything other than OPAL_SUCCESS is
 *                 logged and no data is examined
 * info            list of returned values; only the first entry is inspected
 * cbdata          pointer to the opal_pmix_lock_t the caller is waiting on
 * release_fn      optional function to call once we are done with the
 *                 returned data (may be NULL)
 * release_cbdata  argument passed through to release_fn
 *
 * On success, if the first entry's key is OPAL_PMIX_SERVER_URI, its string
 * is duplicated into orte_process_info.my_hnp_uri. The waiting thread is
 * always woken, whether or not a URI was obtained.
 */
static void infocb(int status,
opal_list_t *info,
void *cbdata,
opal_pmix_release_cbfunc_t release_fn,
void *release_cbdata)
{
opal_value_t *kv;
opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
if (OPAL_SUCCESS != status) {
/* the query itself failed - just report it */
ORTE_ERROR_LOG(status);
} else {
/* only the first returned value is considered */
kv = (opal_value_t*)opal_list_get_first(info);
if (NULL == kv) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
} else {
if (0 == strcmp(kv->key, OPAL_PMIX_SERVER_URI)) {
/* NOTE(review): kv->type and kv->data.string are not checked before
 * the strdup - presumably the server always returns a string here;
 * confirm */
orte_process_info.my_hnp_uri = strdup(kv->data.string);
} else {
/* first entry was not the key we asked for */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
}
}
/* let the provider of the data know we are finished with it */
if (NULL != release_fn) {
release_fn(release_cbdata);
}
/* wake the thread waiting on the lock, regardless of outcome */
OPAL_PMIX_WAKEUP_THREAD(lock);
}
int orte_ess_base_tool_setup(uint8_t flags)
{
int ret;
char *error = NULL;
opal_list_t transports;
orte_jobid_t jobid;
orte_vpid_t vpid;
opal_list_t info;
opal_value_t *kv, val;
opal_pmix_query_t *q;
opal_pmix_lock_t lock;
opal_buffer_t *buf;
/* we need an external progress thread to ensure that things run
* async with the PMIx code */
orte_event_base = opal_progress_thread_init("tool");
/* setup the PMIx framework - ensure it skips all non-PMIx components,
* but do not override anything we were given */
@ -84,7 +120,13 @@ int orte_ess_base_tool_setup(void)
error = "opal_pmix_base_select";
goto error;
}
/* set the event base */
if (NULL == opal_pmix.tool_init) {
/* we no longer support non-pmix tools */
error = "opal_pmix.tool_init";
ret = ORTE_ERR_NOT_SUPPORTED;
goto error;
}
/* set the event base for the pmix component code */
opal_pmix_base_set_evbase(orte_event_base);
/* we have to define our name here */
@ -126,9 +168,6 @@ int orte_ess_base_tool_setup(void)
/* initialize - PMIx may set our name here if we attach to
* a PMIx server */
if (NULL != opal_pmix.tool_init) {
opal_list_t info;
opal_value_t *kv;
OBJ_CONSTRUCT(&info, opal_list_t);
/* pass our name so the PMIx layer can use it */
kv = OBJ_NEW(opal_value_t);
@ -142,13 +181,27 @@ int orte_ess_base_tool_setup(void)
kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid;
kv->type = OPAL_VPID;
opal_list_append(&info, &kv->super);
/* ORTE tools don't need to connect to a PMIx server as
* they will connect via the OOB */
if (0 != flags) {
/* instruct the PMIx layer on if/how to connect */
kv = OBJ_NEW(opal_value_t);
if (0x01 == flags) {
kv->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT);
} else if (0x02 == flags) {
kv->key = strdup(OPAL_PMIX_CONNECT_SYSTEM_FIRST);
} else if (0x04 == flags) {
kv->key = strdup(OPAL_PMIX_CONNECT_TO_SYSTEM);
} else {
opal_output(0, "UNKNOWN CONNECTION FLAG %0x", flags);
error = "unknown connection flags";
ret = ORTE_ERR_BAD_PARAM;
OPAL_LIST_DESTRUCT(&info);
OBJ_RELEASE(kv);
goto error;
}
kv->data.flag = true;
kv->type = OPAL_BOOL;
opal_list_append(&info, &kv->super);
}
if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) {
ORTE_ERROR_LOG(ret);
error = "opal_pmix.init";
@ -158,12 +211,25 @@ int orte_ess_base_tool_setup(void)
OPAL_LIST_DESTRUCT(&info);
ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
}
orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
orte_process_info.super.proc_arch = opal_local_arch;
opal_proc_local_set(&orte_process_info.super);
if (NULL != opal_pmix.query) {
/* query the server for its URI so we can get any IO forwarded to us */
OBJ_CONSTRUCT(&info, opal_list_t);
q = OBJ_NEW(opal_pmix_query_t);
opal_argv_append_nosize(&q->keys, OPAL_PMIX_SERVER_URI);
opal_list_append(&info, &q->super);
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
opal_pmix.query(&info, infocb, &lock);
OPAL_PMIX_WAIT_THREAD(&lock);
OPAL_PMIX_DESTRUCT_LOCK(&lock);
OPAL_LIST_DESTRUCT(&info);
}
/* open and setup the state machine */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -227,12 +293,6 @@ int orte_ess_base_tool_setup(void)
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
OPAL_LIST_DESTRUCT(&transports);
/* since I am a tool, then all I really want to do is communicate.
* So setup communications and be done - finding the HNP
* to which I want to communicate and setting up a route for
* that link is my responsibility
*/
/* we -may- need to know the name of the head
* of our session directory tree, particularly the
* tmp base where any other session directories on
@ -248,7 +308,52 @@ int orte_ess_base_tool_setup(void)
/* setup I/O forwarding system - must come after we init routes */
if (NULL != orte_process_info.my_hnp_uri) {
/* only do this if we were given an HNP */
/* extract the name */
if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
exit(1);
}
/* Set the contact info in the RML - this won't actually establish
* the connection, but just tells the RML how to reach the HNP
* if/when we attempt to send to it
*/
OBJ_CONSTRUCT(&val, opal_value_t);
val.key = OPAL_PMIX_PROC_URI;
val.type = OPAL_STRING;
val.data.string = orte_process_info.my_hnp_uri;
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_HNP, &val))) {
ORTE_ERROR_LOG(ret);
val.key = NULL;
val.data.string = NULL;
OBJ_DESTRUCT(&val);
error = "store HNP URI";
goto error;
}
val.key = NULL;
val.data.string = NULL;
OBJ_DESTRUCT(&val);
/* set the route to be direct */
if (ORTE_SUCCESS != orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
orte_finalize();
exit(1);
}
/* connect to the HNP so we can recv forwarded output */
buf = OBJ_NEW(opal_buffer_t);
ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP,
buf, ORTE_RML_TAG_WARMUP_CONNECTION,
orte_rml_send_callback, NULL);
if (ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
error = "warmup connection";
goto error;
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(NULL, ORTE_PROC_MY_HNP);
/* setup the IOF */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_iof_base_open";
@ -259,46 +364,8 @@ int orte_ess_base_tool_setup(void)
error = "orte_iof_base_select";
goto error;
}
/* if we were given an HNP, then also setup the PLM in case this
* tool wants to request that we spawn something for it */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_plm_base_open";
goto error;
}
/* we don't select the plm framework as we only want the
* base proxy functions */
}
#if OPAL_ENABLE_FT_CR == 1
/*
* Setup the SnapC
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
ORTE_ERROR_LOG(ret);
error = "orte_snapc_base_select";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_sstore_base_select";
goto error;
}
/* Tools do not need all the OPAL CR stuff */
opal_cr_set_enabled(false);
#endif
return ORTE_SUCCESS;
@ -314,11 +381,6 @@ int orte_ess_base_tool_finalize(void)
{
orte_wait_finalize();
#if OPAL_ENABLE_FT_CR == 1
mca_base_framework_close(&orte_snapc_base_framework);
mca_base_framework_close(&orte_sstore_base_framework);
#endif
orte_rml.close_conduit(orte_mgmt_conduit);
/* if I am a tool, then all I will have done is

Просмотреть файл

@ -92,7 +92,7 @@ static int rte_init(void)
if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_tool_setup";
goto error;

Просмотреть файл

@ -87,7 +87,7 @@ static int rte_init(void)
if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_tool_setup";
goto error;

Просмотреть файл

@ -91,7 +91,7 @@ static int rte_init(void)
if (ORTE_PROC_IS_TOOL) {
/* otherwise, if I am a tool proc, use that procedure */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_tool_setup";
goto error;

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,6 +32,9 @@ int orte_ess_tool_component_query(mca_base_module_t **module, int *priority);
/* Component structure for ess/tool. The boolean fields are bound to
 * component variables registered with mca_base_component_var_register()
 * in tool_component_register(). */
typedef struct {
orte_ess_base_component_t super;
bool async;
/* look for a system PMIx server first */
bool system_server_first;
/* only connect to a system server (and not an mpirun) */
bool system_server_only;
/* do not connect to a PMIx server */
bool do_not_connect;
} orte_ess_tool_component_t;
ORTE_MODULE_DECLSPEC extern orte_ess_tool_component_t mca_ess_tool_component;

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -76,6 +76,30 @@ static int tool_component_register(void)
OPAL_INFO_LVL_2,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_ess_tool_component.async);
mca_ess_tool_component.do_not_connect = false;
(void) mca_base_component_var_register (c, "do_not_connect",
"Do not connect to a PMIx server",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_2,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_ess_tool_component.do_not_connect);
mca_ess_tool_component.system_server_first = false;
(void) mca_base_component_var_register (c, "system_server_first",
"Look for a system PMIx server first",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_2,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_ess_tool_component.system_server_first);
mca_ess_tool_component.system_server_only = false;
(void) mca_base_component_var_register (c, "system_server_only",
"Only connect to a system server (and not an mpirun)",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_2,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_ess_tool_component.system_server_only);
return ORTE_SUCCESS;
}
@ -111,4 +135,3 @@ orte_ess_tool_component_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -63,6 +63,7 @@ static int rte_init(void)
{
int ret;
char *error = NULL;
uint8_t flags;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@ -79,8 +80,18 @@ static int rte_init(void)
progress_thread_running = true;
}
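/* set up the tool connection flags - at most one bit is passed along,
 * checked in priority order: 0x01 = do not connect,
 * 0x02 = system server first, 0x04 = system server only */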
flags = 0;
if (mca_ess_tool_component.do_not_connect) {
flags = 0x01;
} else if (mca_ess_tool_component.system_server_first) {
flags = 0x02;
} else if (mca_ess_tool_component.system_server_only) {
flags = 0x04;
}
/* do the standard tool init */
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(flags))) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_tool_setup";
goto error;

Просмотреть файл

@ -142,7 +142,7 @@ BEGIN_C_DECLS
buf = OBJ_NEW(opal_buffer_t); \
\
/* setup the tag to pull from HNP */ \
tag = ORTE_IOF_STDOUTALL | ORTE_IOF_PULL; \
tag = ORTE_IOF_STDOUTALL | ORTE_IOF_PULL | ORTE_IOF_EXCLUSIVE; \
opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG); \
/* pack the name of the source we want to pull */ \
nm.jobid = (a)->jobid; \

Просмотреть файл

@ -387,6 +387,7 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
orte_vpid_t *vptr;
int i, rc;
char *serial_number;
orte_process_name_t requestor, *rptr;
ORTE_ACQUIRE_OBJECT(caddy);
@ -425,7 +426,12 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
* indicating that request */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL, NULL, OPAL_BOOL)) {
/* send a message to our IOF containing the requested pull */
rptr = &requestor;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&rptr, OPAL_NAME)) {
ORTE_IOF_PROXY_PULL(jdata, rptr);
} else {
ORTE_IOF_PROXY_PULL(jdata, &jdata->originator);
}
/* the tool will PUSH its stdin, so nothing we need to do here
* about stdin */
}

Просмотреть файл

@ -240,6 +240,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
} else if (0 == strcmp(info->key, OPAL_PMIX_REQUESTOR_IS_TOOL)) {
orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB,
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
/* request that IO be forwarded to the requesting tool */
orte_set_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL,
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
} else if (0 == strcmp(info->key, OPAL_PMIX_STDIN_TGT)) {
if (0 == strcmp(info->data.string, "all")) {
jdata->stdin_target = ORTE_VPID_WILDCARD;

Просмотреть файл

@ -676,6 +676,13 @@ static void _query(int sd, short args, void *cbdata)
opal_list_append(results, &kv->super);
}
#endif
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_SERVER_URI)) {
/* they want our URI */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SERVER_URI);
kv->type = OPAL_STRING;
kv->data.string = strdup(orte_process_info.my_hnp_uri);
opal_list_append(results, &kv->super);
}
}
}

Просмотреть файл

@ -53,9 +53,7 @@ DIST_SUBDIRS += \
tools/prun
if OPAL_WANT_PRUN
if WANT_INSTALL_HEADERS
SUBDIRS += \
tools/prun \
tools/orte-dvm
endif
endif

Просмотреть файл

@ -502,6 +502,14 @@ static void notify_requestor(int sd, short args, void *cbdata)
if (notify) {
info = OBJ_NEW(opal_list_t);
/* ensure this only goes to the job terminated event handler */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
val->type = OPAL_BOOL;
val->data.flag = true;
opal_list_append(info, &val->super);
/* tell the server not to cache the event as subsequent jobs
* do not need to know about it */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_EVENT_DO_NOT_CACHE);
val->type = OPAL_BOOL;
@ -510,15 +518,20 @@ static void notify_requestor(int sd, short args, void *cbdata)
/* provide the status */
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_JOB_TERM_STATUS);
val->type = OPAL_INT;
val->data.integer = ret;
val->type = OPAL_STATUS;
val->data.status = ret;
opal_list_append(info, &val->super);
/* if there was a problem, we need to send the requestor more info about what happened */
if (0 < ret) {
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_PROCID);
val->type = OPAL_NAME;
val->data.name = pptr->name;
val->data.name.jobid = jdata->jobid;
if (NULL != pptr) {
val->data.name.vpid = pptr->name.vpid;
} else {
val->data.name.vpid = ORTE_VPID_WILDCARD;
}
opal_list_append(info, &val->super);
}
opal_pmix.notify_event(OPAL_ERR_JOB_TERMINATED, NULL,

Просмотреть файл

@ -26,10 +26,7 @@
# post-processed forms of the CFLAGS in the library targets down
# below.
AM_CPPFLAGS = $(OPAL_PMIX_CPPFLAGS)
CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS)
AM_LDFLAGS = $(OPAL_PMIX_LDFLAGS)
include $(top_srcdir)/Makefile.ompi-rules
@ -56,10 +53,7 @@ prun_SOURCES = \
prun_LDADD = \
$(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
$(OPAL_PMIX_LDADD)
prun_LIBS = $(OPAL_PMIX_LIBS)
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
distclean-local:
rm -f $(man_pages)

Просмотреть файл

@ -69,22 +69,23 @@
#include "opal/util/show_help.h"
#include "opal/util/fd.h"
#include "opal/sys/atomic.h"
#if OPAL_ENABLE_FT_CR == 1
#include "opal/runtime/opal_cr.h"
#endif
#include "opal/version.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_info_support.h"
#include "opal/runtime/opal_progress_threads.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/dss/dss.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
/* ensure I can behave like a daemon */
#include "prun.h"
#include <include/pmix.h>
#include <include/pmix_tool.h>
/**
* Global struct for caching orte command line options.
@ -143,7 +144,7 @@ typedef struct orte_cmd_options_t orte_cmd_options_t;
static orte_cmd_options_t orte_cmd_options = {0};
static opal_cmd_line_t *orte_cmd_line = NULL;
static opal_list_t job_info;
static opal_pmix_lock_t globallock;
static volatile bool active = false;
static int create_app(int argc, char* argv[],
opal_list_t *jdata,
@ -476,10 +477,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
};
static void infocb(pmix_status_t status,
pmix_info_t *info, size_t ninfo,
static void infocb(int status,
opal_list_t *info,
void *cbdata,
pmix_release_cbfunc_t release_fn,
opal_pmix_release_cbfunc_t release_fn,
void *release_cbdata)
{
opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
@ -491,35 +492,42 @@ static void infocb(pmix_status_t status,
OPAL_PMIX_WAKEUP_THREAD(lock);
}
static void regcbfunc(pmix_status_t status, size_t ref, void *cbdata)
static void regcbfunc(int status, size_t ref, void *cbdata)
{
/* Registration-complete callback handed to opal_pmix.register_evhandler();
 * cbdata is the lock the registering thread is waiting on. */
opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
OPAL_ACQUIRE_OBJECT(lock);
/* status and ref are not examined here - just wake the waiting thread */
OPAL_PMIX_WAKEUP_THREAD(lock);
}
static void evhandler(size_t evhdlr_registration_id,
pmix_status_t status,
const pmix_proc_t *source,
pmix_info_t info[], size_t ninfo,
pmix_info_t *results, size_t nresults,
pmix_event_notification_cbfunc_fn_t cbfunc,
/* Callback registered in prun() via orte_state.add_proc_state() for
 * ORTE_PROC_STATE_TERMINATED. Clears the file-scope 'active' flag that
 * prun() polls while waiting for the job to finish. None of the
 * arguments are used. */
static void release(int sd, short args, void *cbdata)
{
active = false;
}
static bool fired = false;
static void evhandler(int status,
const opal_process_name_t *source,
opal_list_t *info, opal_list_t *results,
opal_pmix_notification_complete_fn_t cbfunc,
void *cbdata)
{
size_t n;
opal_value_t *val;
if (NULL != info) {
for (n=0; n < ninfo; n++) {
if (0 == strncmp(info[n].key, PMIX_JOB_TERM_STATUS, PMIX_MAX_KEYLEN)) {
opal_output(0, "JOB COMPLETED WITH STATUS %s", PMIx_Error_string(info[n].value.data.status));
OPAL_LIST_FOREACH(val, info, opal_value_t) {
if (0 == strcmp(val->key, OPAL_PMIX_JOB_TERM_STATUS)) {
opal_output(0, "JOB COMPLETED WITH STATUS %d",
val->data.integer);
}
}
}
if (NULL != cbfunc) {
cbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cbdata);
cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata);
}
if (!fired) {
fired = true;
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_TERMINATED);
}
OPAL_ACQUIRE_OBJECT(&globallock);
OPAL_PMIX_WAKEUP_THREAD(&globallock);
}
@ -530,14 +538,9 @@ int prun(int argc, char *argv[])
opal_pmix_lock_t lock;
opal_list_t apps;
opal_value_t *val;
opal_pmix_app_t *app;
pmix_status_t code;
char nspace[PMIX_MAX_NSLEN+1];
pmix_info_t info;
pmix_proc_t myproc;
size_t asz, jsz;
pmix_app_t *papps = NULL;
pmix_info_t *pinfo = NULL;
opal_list_t info;
opal_jobid_t jobid;
struct timespec tp = {0, 100000};
/* init the globals */
memset(&orte_cmd_options, 0, sizeof(orte_cmd_options));
@ -644,106 +647,85 @@ int prun(int argc, char *argv[])
return rc;
}
/* use the system connection first, if available */
PMIX_INFO_LOAD(&info, OPAL_PMIX_CONNECT_SYSTEM_FIRST, NULL, PMIX_BOOL);
/* init as a tool */
if (OPAL_SUCCESS != PMIx_tool_init(&myproc, &info, 1)) {
fprintf(stderr, "Unable to init as tool\n");
exit(1);
/* tell the ess/tool component that we want to connect to a system-level
* PMIx server */
opal_setenv("OMPI_MCA_ess_tool_system_server_only", "1", true, &environ);
/* now initialize ORTE */
if (OPAL_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
OPAL_ERROR_LOG(rc);
return rc;
}
PMIX_INFO_DESTRUCT(&info);
/* if the user just wants us to terminate a DVM, then do so */
if (orte_cmd_options.terminate_dvm) {
PMIX_INFO_LOAD(&info, OPAL_PMIX_JOB_CTRL_TERMINATE, NULL, PMIX_BOOL);
OBJ_CONSTRUCT(&info, opal_list_t);
val = OBJ_NEW(opal_value_t);
val->key = strdup(OPAL_PMIX_JOB_CTRL_TERMINATE);
val->type = OPAL_BOOL;
val->data.flag = true;
opal_list_append(&info, &val->super);
fprintf(stderr, "TERMINATING DVM...");
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
rc = PMIx_Job_control_nb(NULL, 0, &info, 1, infocb, (void*)&lock);
rc = opal_pmix.job_control(NULL, &info, infocb, (void*)&lock);
OPAL_PMIX_WAIT_THREAD(&lock);
OPAL_PMIX_DESTRUCT_LOCK(&lock);
PMIX_INFO_DESTRUCT(&info);
OPAL_LIST_DESTRUCT(&info);
fprintf(stderr, "DONE\n");
goto DONE;
}
orte_state.add_proc_state(ORTE_PROC_STATE_TERMINATED, release, ORTE_SYS_PRI);
/* get here if they want to run an application, so let's parse
* the cmd line to get it */
if (OPAL_SUCCESS != parse_locals(&apps, argc, argv)) {
opal_output(0, "[%s:%d] SOMETHING WRONG", __FILE__, __LINE__);
if (OPAL_SUCCESS != (rc = parse_locals(&apps, argc, argv))) {
OPAL_ERROR_LOG(rc);
OPAL_LIST_DESTRUCT(&apps);
goto DONE;
}
/* bozo check */
if (0 == (asz = opal_list_get_size(&apps))) {
opal_output(0, "[%s:%d] SOMETHING WRONG", __FILE__, __LINE__);
if (0 == opal_list_get_size(&apps)) {
opal_output(0, "No application specified!");
goto DONE;
}
/* init flag */
active = true;
/* register for job terminations so we get notified when
* our job completes */
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
code = PMIX_ERR_JOB_TERMINATED;
PMIx_Register_event_handler(&code, 1, NULL, 0, evhandler, regcbfunc, &lock);
OBJ_CONSTRUCT(&info, opal_list_t);
val = OBJ_NEW(opal_value_t);
val->key = strdup("foo");
val->type = OPAL_INT;
val->data.integer = OPAL_ERR_JOB_TERMINATED;
opal_list_append(&info, &val->super);
opal_pmix.register_evhandler(&info, NULL, evhandler, regcbfunc, &lock);
OPAL_PMIX_WAIT_THREAD(&lock);
OPAL_PMIX_DESTRUCT_LOCK(&lock);
OPAL_LIST_DESTRUCT(&info);
/* convert the job info and apps to PMIx arrays */
if (0 < (jsz = opal_list_get_size(&job_info))) {
PMIX_INFO_CREATE(pinfo, jsz);
i=0;
OPAL_LIST_FOREACH(val, &job_info, opal_value_t) {
(void)strncpy(pinfo[i].key, val->key, PMIX_MAX_KEYLEN);
/* we only have bool and string types here */
if (OPAL_BOOL == val->type) {
pinfo[i].value.type = PMIX_BOOL;
pinfo[i].value.data.flag = val->data.flag;
} else if (OPAL_STRING == val->type) {
pinfo[i].value.type = PMIX_STRING;
pinfo[i].value.data.string = strdup(val->data.string);
} else {
opal_output(0, "UNSUPPORTED TYPE %d", val->type);
}
++i;
}
}
OPAL_LIST_DESTRUCT(&job_info);
PMIX_APP_CREATE(papps, asz);
i=0;
OPAL_LIST_FOREACH(app, &apps, opal_pmix_app_t) {
papps[i].cmd = strdup(app->cmd);
papps[i].argv = opal_argv_copy(app->argv);
papps[i].env = opal_argv_copy(app->env);
if (NULL != app->cwd) {
papps[i].cwd = strdup(app->cwd);
}
papps[i].maxprocs = app->maxprocs;
++i;
}
OPAL_LIST_DESTRUCT(&apps);
OPAL_PMIX_CONSTRUCT_LOCK(&globallock);
if (PMIX_SUCCESS != PMIx_Spawn(pinfo, jsz, papps, asz, nspace)) {
opal_output(0, "[%s:%d] SOMETHING WRONG", __FILE__, __LINE__);
OPAL_PMIX_DESTRUCT_LOCK(&globallock);
if (OPAL_SUCCESS != (rc = opal_pmix.spawn(&job_info, &apps, &jobid))) {
opal_output(0, "Job failed to spawn: %s", opal_strerror(rc));
goto DONE;
}
opal_output(0, "JOB %s EXECUTING", nspace);
OPAL_PMIX_WAIT_THREAD(&globallock);
OPAL_PMIX_DESTRUCT_LOCK(&globallock);
if (NULL != pinfo) {
PMIX_INFO_FREE(pinfo, jsz);
}
if (NULL != papps) {
PMIX_APP_FREE(papps, asz);
OPAL_LIST_DESTRUCT(&job_info);
OPAL_LIST_DESTRUCT(&apps);
opal_output(0, "JOB %s EXECUTING", OPAL_JOBID_PRINT(jobid));
while (active) {
nanosleep(&tp, NULL);
}
DONE:
/* cleanup and leave */
PMIx_tool_finalize();
opal_finalize();
orte_finalize();
return 0;
}