1
1

Add a new OMPI rte component to support direct-launch using PMIx.

Clean up several places where abstraction violations crept into the OMPI layer (direct references to ORTE). Add some missing includes that were exposed by this change.

Note that this compiles, but I haven't tested it for execution yet. Handing it over to Noah Evans for completion.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
Ralph Castain 2017-11-28 11:42:36 -08:00
parent c4c5df4d1c
commit 7ad6886a30
13 changed files with 1086 additions and 81 deletions

View File

@ -158,7 +158,7 @@ int ompi_comm_init(void)
in the most generic sense. This is used by OMPIO for deciding which
ranks to use for aggregators
*/
opal_process_name_t wildcard = {ORTE_PROC_MY_NAME->jobid, OPAL_VPID_WILDCARD};
opal_process_name_t wildcard = {OMPI_PROC_MY_NAME->jobid, OPAL_VPID_WILDCARD};
char *str=NULL;
int rc;

View File

@ -16,6 +16,7 @@
* reserved.
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -24,6 +25,7 @@
*/
#include "ompi_config.h"
#include "opal/class/opal_bitmap.h"
#include "ompi/group/group.h"
#include "ompi/constants.h"
#include "ompi/proc/proc.h"

View File

@ -3,6 +3,7 @@
# Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved.
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -10,8 +11,9 @@
# $HEADER$
#
# Highest priority, as it's the default
AC_DEFUN([MCA_ompi_rte_orte_PRIORITY], [100])
# Lowest priority, as it's the default and we want
# it to be able to be overridden
AC_DEFUN([MCA_ompi_rte_orte_PRIORITY], [10])
# Force this component to compile in static-only mode
AC_DEFUN([MCA_ompi_rte_orte_COMPILE_MODE], [

View File

@ -67,13 +67,11 @@ typedef orte_ns_cmp_bitmask_t ompi_rte_cmp_bitmask_t;
#define OMPI_NAME ORTE_NAME
#define OMPI_PROCESS_NAME_HTON ORTE_PROCESS_NAME_HTON
#define OMPI_PROCESS_NAME_NTOH ORTE_PROCESS_NAME_NTOH
#define OMPI_RTE_MY_NODEID ORTE_PROC_MY_DAEMON->vpid
/* database keys */
#define OMPI_RTE_NODE_ID ORTE_DB_DAEMON_VPID
#define OMPI_RTE_HOST_ID ORTE_DB_HOSTID
#if OPAL_ENABLE_DEBUG
static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * name);
static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * name) {
return (orte_process_name_t *)name;
}
#else
#define OMPI_CAST_RTE_NAME(a) ((orte_process_name_t*)(a))
#endif
@ -95,30 +93,11 @@ OMPI_DECLSPEC void __opal_attribute_noreturn__
#define ompi_rte_finalize() orte_finalize()
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);
typedef struct {
ompi_rte_component_t super;
opal_mutex_t lock;
opal_list_t modx_reqs;
} ompi_rte_orte_component_t;
typedef struct {
opal_list_item_t super;
opal_mutex_t lock;
opal_condition_t cond;
bool active;
orte_process_name_t peer;
} ompi_orte_tracker_t;
OBJ_CLASS_DECLARATION(ompi_orte_tracker_t);
#if OPAL_ENABLE_DEBUG
static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * name) {
return (orte_process_name_t *)name;
}
#endif
/* check dynamics support */
OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port);
#define ompi_proc_applied_binding orte_proc_applied_binding
END_C_DECLS
#endif /* MCA_OMPI_RTE_ORTE_H */

View File

@ -1,7 +1,7 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
*
@ -47,59 +47,34 @@ static int rte_orte_close(void);
* and pointers to our public functions in it
*/
ompi_rte_orte_component_t mca_rte_orte_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
ompi_rte_component_t mca_rte_orte_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
.base_version = {
OMPI_RTE_BASE_VERSION_1_0_0,
.base_version = {
OMPI_RTE_BASE_VERSION_1_0_0,
/* Component name and version */
.mca_component_name = "orte",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* Component name and version */
.mca_component_name = "orte",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = rte_orte_open,
.mca_close_component = rte_orte_close,
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
}
/* Component open and close functions */
.mca_open_component = rte_orte_open,
.mca_close_component = rte_orte_close,
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
static int rte_orte_open(void)
{
OBJ_CONSTRUCT(&mca_rte_orte_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_rte_orte_component.modx_reqs, opal_list_t);
return OMPI_SUCCESS;
}
static int rte_orte_close(void)
{
opal_mutex_lock(&mca_rte_orte_component.lock);
OPAL_LIST_DESTRUCT(&mca_rte_orte_component.modx_reqs);
opal_mutex_unlock(&mca_rte_orte_component.lock);
OBJ_DESTRUCT(&mca_rte_orte_component.lock);
return OMPI_SUCCESS;
}
static void con(ompi_orte_tracker_t *p)
{
p->active = true;
OBJ_CONSTRUCT(&p->lock, opal_mutex_t);
OBJ_CONSTRUCT(&p->cond, opal_condition_t);
}
static void des(ompi_orte_tracker_t *p)
{
OBJ_DESTRUCT(&p->lock);
OBJ_DESTRUCT(&p->cond);
}
OBJ_CLASS_INSTANCE(ompi_orte_tracker_t,
opal_list_item_t,
con, des);

View File

@ -51,7 +51,7 @@
#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
extern ompi_rte_orte_component_t mca_rte_orte_component;
extern ompi_rte_component_t mca_rte_orte_component;
void ompi_rte_abort(int error_code, char *fmt, ...)
{

View File

@ -0,0 +1,29 @@
#
# Copyright (c) 2012 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers = rte_pmix.h
sources = \
rte_pmix_component.c \
rte_pmix_module.c
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ompidir = $(ompiincludedir)/$(subdir)
nobase_ompi_HEADERS = $(headers)
endif
# We only ever build this component statically
noinst_LTLIBRARIES = libmca_rte_pmix.la
libmca_rte_pmix_la_SOURCES =$(sources) $(headers)
libmca_rte_pmix_la_LDFLAGS = -module -avoid-version
libmca_rte_pmix_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la

View File

@ -0,0 +1,45 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved.
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
#
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Higher priority to override the default
AC_DEFUN([MCA_ompi_rte_pmix_PRIORITY], [50])
# Force this component to compile in static-only mode
AC_DEFUN([MCA_ompi_rte_pmix_COMPILE_MODE], [
AC_MSG_CHECKING([for MCA component $2:$3 compile mode])
$4="static"
AC_MSG_RESULT([$$4])
])
# If component was selected, $1 will be 1 and we should set the base header
AC_DEFUN([MCA_ompi_rte_pmix_POST_CONFIG],[
AS_IF([test "$1" = "1"], [ompi_rte_base_include="pmix/rte_pmix.h"])
AC_DEFINE_UNQUOTED([OMPI_RTE_PMIX], [$1],
[Defined to 1 if the OMPI runtime component is PMIX])
AM_CONDITIONAL([OMPI_RTE_PMIX], [test $1 = 1])
])dnl
# MCA_ompi_rte_pmix_CONFIG([action-if-can-compile],
#                          [action-if-cant-compile])
# ------------------------------------------------
# The component is built only when the user explicitly passes
# --with-ompi-pmix-rte (i.e. the option value is exactly "yes").
# The comparison uses the portable "=" operator of test, since "=="
# is a shell extension that not every /bin/sh accepts.
AC_DEFUN([MCA_ompi_rte_pmix_CONFIG],[
    AC_CONFIG_FILES([ompi/mca/rte/pmix/Makefile])

    AC_ARG_WITH([ompi-pmix-rte],
                AC_HELP_STRING([--with-ompi-pmix-rte],
                               [Use PMIx as the OMPI run-time environment (default: no)]))

    AS_IF([test "$with_ompi_pmix_rte" = "yes"],
          [$1
           AC_MSG_NOTICE([PMIx RTE selected by user])],
          [$2])
])

View File

@ -0,0 +1,136 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* When this component is used, this file is included in the rest of
* the OPAL/OMPI code base via ompi/mca/rte/rte.h. As such,
* this header represents the public interface to this static component.
*/
#ifndef MCA_OMPI_RTE_PMIX_H
#define MCA_OMPI_RTE_PMIX_H
#include "ompi_config.h"
#include "ompi/constants.h"
#include <stdint.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
struct opal_proc_t;
#include "opal/threads/threads.h"
#include "opal/util/proc.h"
#include "opal/mca/hwloc/hwloc-internal.h"
#include "opal/mca/pmix/pmix.h"
struct ompi_proc_t;
struct ompi_communicator_t;
BEGIN_C_DECLS
/* Process name objects and operations */
typedef opal_process_name_t ompi_process_name_t;
typedef uint32_t ompi_jobid_t;
typedef uint32_t ompi_vpid_t;
/* some local storage */
OMPI_DECLSPEC extern opal_process_name_t pmix_name_wildcard;
OMPI_DECLSPEC extern opal_process_name_t pmix_proc_my_name;
OMPI_DECLSPEC extern hwloc_cpuset_t ompi_proc_applied_binding;
#define OMPI_PROC_MY_NAME (&pmix_proc_my_name)
#define OMPI_NAME_WILDCARD (&pmix_name_wildcard)
/* Bitmask selecting which name fields ompi_rte_compare_name_fields()
 * examines.  JOBID and VPID pick the individual fields, WILD makes a
 * wildcard value in a selected field match anything. */
typedef uint8_t ompi_rte_cmp_bitmask_t;
#define OMPI_RTE_CMP_NONE   0x00
#define OMPI_RTE_CMP_JOBID  0x02
#define OMPI_RTE_CMP_VPID   0x04
/* "ALL" must select every field; it previously aliased CMP_VPID, which
 * made full-name comparisons silently ignore the jobid */
#define OMPI_RTE_CMP_ALL    (OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID)
#define OMPI_RTE_CMP_WILD   0x10
#define OMPI_NAME_PRINT(a) OPAL_NAME_PRINT((*(a)))
OMPI_DECLSPEC int ompi_rte_compare_name_fields(ompi_rte_cmp_bitmask_t mask,
const opal_process_name_t* name1,
const opal_process_name_t* name2);
OMPI_DECLSPEC int ompi_rte_convert_string_to_process_name(opal_process_name_t *name,
const char* name_string);
OMPI_DECLSPEC int ompi_rte_convert_process_name_to_string(char** name_string,
const opal_process_name_t *name);
/* In this component a jobid carries no separate "family" part: the
 * local jobid is the jobid itself and the family is always 0.
 * Arguments are parenthesized so that expressions passed as arguments
 * keep their meaning after expansion. */
#define OMPI_LOCAL_JOBID(jobid) (jobid)
#define OMPI_JOB_FAMILY(jobid) 0
/* do a little with the "family" param to avoid compiler warnings -
 * it is masked to zero, so the result is always just "local" */
#define OMPI_CONSTRUCT_JOBID(family,local) \
    (((family) & 0x0000) | (local))
/* This is the DSS tag to serialize a proc name */
#define OMPI_NAME OPAL_NAME
#define OMPI_PROCESS_NAME_HTON OPAL_PROCESS_NAME_HTON
#define OMPI_PROCESS_NAME_NTOH OPAL_PROCESS_NAME_NTOH
#if OPAL_ENABLE_DEBUG
/* Debug builds route the conversion through a real function, so the
 * argument has to be an opal_process_name_t pointer to compile.
 * The name types are identical in this component, hence the value is
 * handed back untouched. */
static inline opal_process_name_t *OMPI_CAST_RTE_NAME(opal_process_name_t *name)
{
    return name;
}
#else
/* Non-debug builds use a plain cast */
#define OMPI_CAST_RTE_NAME(a) ((opal_process_name_t*)(a))
#endif
/* Process info struct and values */
typedef uint16_t ompi_node_rank_t;
typedef uint16_t ompi_local_rank_t;
#define OMPI_NODE_RANK_INVALID UINT16_MAX
#define OMPI_LOCAL_RANK_INVALID UINT16_MAX
/* Per-process information gathered by ompi_rte_init() and exposed to
 * the MPI layer under the name ompi_process_info. */
typedef struct {
/* NOTE(review): no visible code assigns my_name; the process name is
 * kept in pmix_proc_my_name instead - confirm whether this is used */
opal_process_name_t my_name;
/* NOTE(review): not assigned by the visible code (zeroed at init) */
char *my_hnp_uri;
/* hostname as returned by opal_get_proc_hostname() for this process */
char *nodename;
/* NOTE(review): not assigned by the visible code (zeroed at init) */
pid_t pid;
/* job-level session directory: taken from PMIx when provided,
 * otherwise created locally; removed again in ompi_rte_finalize() */
char *job_session_dir;
/* NOTE(review): not assigned by the visible code (zeroed at init) */
char *proc_session_dir;
/* value of OPAL_PMIX_LOCAL_RANK for this process */
uint16_t my_local_rank;
/* value of OPAL_PMIX_NODE_RANK for this process */
uint16_t my_node_rank;
/* OPAL_PMIX_LOCAL_SIZE minus one (i.e. not counting ourselves),
 * or 0 when the value could not be retrieved */
int32_t num_local_peers;
/* value of OPAL_PMIX_JOB_SIZE */
uint32_t num_procs;
/* value of OPAL_PMIX_APPNUM, or 0 when not available */
uint32_t app_num;
} pmix_process_info_t;
OMPI_DECLSPEC extern pmix_process_info_t pmix_process_info;
#define ompi_process_info pmix_process_info
OMPI_DECLSPEC extern bool pmix_proc_is_bound;
#define ompi_rte_proc_is_bound pmix_proc_is_bound
/* Error handling objects and operations */
OMPI_DECLSPEC void __opal_attribute_noreturn__
ompi_rte_abort(int error_code, char *fmt, ...);
OMPI_DECLSPEC void ompi_rte_abort_peers(opal_process_name_t *procs,
int32_t num_procs,
int error_code);
#define OMPI_ERROR_LOG OPAL_ERROR_LOG
/* Init and finalize operations */
OMPI_DECLSPEC int ompi_rte_init(int *argc, char ***argv);
OMPI_DECLSPEC int ompi_rte_finalize(void);
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);
/* check dynamics support */
OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port);
END_C_DECLS
#endif /* MCA_OMPI_RTE_PMIX_H */

View File

@ -0,0 +1,77 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include "opal/threads/threads.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/rte/rte.h"
#include "rte_pmix.h"
/*
* Public string showing the component version number
*/
const char *ompi_rte_pmix_component_version_string =
"OMPI pmix rte MCA component version " OMPI_VERSION;
/*
* Local function
*/
static int rte_pmix_open(void);
static int rte_pmix_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
/* Component descriptor for the "pmix" rte component.  It only carries
 * MCA meta information plus the open/close hooks defined in this file. */
ompi_rte_component_t mca_rte_pmix_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
.base_version = {
OMPI_RTE_BASE_VERSION_1_0_0,
/* Component name and version */
.mca_component_name = "pmix",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = rte_pmix_open,
.mca_close_component = rte_pmix_close,
},
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
/* Component open hook.  The pmix rte component keeps no component-level
 * state, so there is nothing to set up and the call always succeeds. */
static int rte_pmix_open(void)
{
    return OMPI_SUCCESS;
}
/* Component close hook.  Nothing was allocated in rte_pmix_open(), so
 * there is nothing to release and the call always succeeds. */
static int rte_pmix_close(void)
{
    return OMPI_SUCCESS;
}

View File

@ -0,0 +1,759 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2012-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#include <sys/stat.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#ifdef HAVE_PWD_H
#include <pwd.h>
#endif /* HAVE_PWD_H */
#include "opal/dss/dss.h"
#include "opal/util/argv.h"
#include "opal/util/error.h"
#include "opal/util/opal_getcwd.h"
#include "opal/util/os_path.h"
#include "opal/util/os_dirpath.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/threads/threads.h"
#include "opal/class/opal_list.h"
#include "opal/dss/dss.h"
#include "ompi/mca/rte/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
/* instantiate a debugger-required value */
volatile int MPIR_being_debugged = 0;
extern ompi_rte_component_t mca_rte_pmix_component;
/* storage to support OMPI */
/* name whose fields act as wildcards; printed/parsed as "*" */
opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1};
/* name whose fields mark "invalid"; printed/parsed as "$" */
opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
/* our own name - copied from opal_proc_local_get() in ompi_rte_init() */
opal_process_name_t pmix_proc_my_name = {0, 0};
/* NOTE(review): never assigned in the visible code, stays NULL */
hwloc_cpuset_t ompi_proc_applied_binding = NULL;
/* process info exposed as ompi_process_info; filled by ompi_rte_init() */
pmix_process_info_t pmix_process_info = {0};
/* set to true by ompi_rte_init() when SLURM_CPU_BIND_TYPE is in the environment */
bool pmix_proc_is_bound = false;
/* NOTE(review): read by ompi_rte_wait_for_debugger() but never set in the visible code */
static bool pmix_in_parallel_debugger = false;
/* remember which envars ompi_rte_init() pushed into the environment
 * so that ompi_rte_finalize() removes exactly those */
static bool added_transport_keys = false;
static bool added_num_procs = false;
static bool added_app_ctx = false;
/* forward declarations of local helpers defined further below */
static char* pre_condition_transports_print(uint64_t *unique_key);
static int _setup_job_session_dir(char **sdir);
/* characters used in the string form of a process name: "<jobid>.<vpid>",
 * where either field may be the wildcard or the invalid marker */
#define ORTE_SCHEMA_DELIMITER_CHAR '.'
#define ORTE_SCHEMA_WILDCARD_CHAR '*'
#define ORTE_SCHEMA_WILDCARD_STRING "*"
#define ORTE_SCHEMA_INVALID_CHAR '$'
#define ORTE_SCHEMA_INVALID_STRING "$"
/*
 * Compare the selected fields of two process names.
 *
 * @param fields  bitmask of OMPI_RTE_CMP_JOBID / OMPI_RTE_CMP_VPID
 *                choosing the fields to compare, optionally combined
 *                with OMPI_RTE_CMP_WILD
 * @param name1   first name (may be NULL)
 * @param name2   second name (may be NULL)
 * @return OPAL_EQUAL, OPAL_VALUE1_GREATER or OPAL_VALUE2_GREATER
 *
 * A NULL name sorts below any non-NULL name; two NULL names are equal.
 * Without OMPI_RTE_CMP_WILD the wildcard value is treated like any
 * other number.  With it, a selected field is skipped (considered a
 * match) when either name holds the wildcard value in that field.
 * The jobid is compared before the vpid.
 */
int ompi_rte_compare_name_fields(ompi_rte_cmp_bitmask_t fields,
                                 const opal_process_name_t* name1,
                                 const opal_process_name_t* name2)
{
    const bool honor_wildcards = (0 != (fields & OMPI_RTE_CMP_WILD));
    bool wild_match;

    /* NULL handling first */
    if (NULL == name1) {
        return (NULL == name2) ? OPAL_EQUAL : OPAL_VALUE2_GREATER;
    }
    if (NULL == name2) {
        return OPAL_VALUE1_GREATER;
    }

    /* jobid field */
    if (0 != (fields & OMPI_RTE_CMP_JOBID)) {
        wild_match = honor_wildcards &&
                     (pmix_name_wildcard.jobid == name1->jobid ||
                      pmix_name_wildcard.jobid == name2->jobid);
        if (!wild_match && name1->jobid != name2->jobid) {
            return (name1->jobid < name2->jobid) ? OPAL_VALUE2_GREATER
                                                 : OPAL_VALUE1_GREATER;
        }
    }

    /* jobids matched (or were not selected) - vpid field */
    if (0 != (fields & OMPI_RTE_CMP_VPID)) {
        wild_match = honor_wildcards &&
                     (pmix_name_wildcard.vpid == name1->vpid ||
                      pmix_name_wildcard.vpid == name2->vpid);
        if (!wild_match && name1->vpid != name2->vpid) {
            return (name1->vpid < name2->vpid) ? OPAL_VALUE2_GREATER
                                               : OPAL_VALUE1_GREATER;
        }
    }

    /* every selected field matched */
    return OPAL_EQUAL;
}
/*
 * Parse the string form "<jobid>.<vpid>" of a process name.
 *
 * Each field is either a decimal number, the wildcard string "*" or
 * the invalid marker "$".
 *
 * @param name         output; set to the invalid name before parsing, so
 *                     it holds the invalid name whenever an error is
 *                     returned
 * @param name_string  string to parse
 * @return OPAL_SUCCESS, OPAL_ERR_BAD_PARAM when an argument is NULL or
 *         the delimiter is missing, OPAL_ERR_OUT_OF_RESOURCE when the
 *         working copy of the string cannot be allocated
 *
 * Numeric fields are converted with strtoul() without further
 * validation, i.e. non-numeric text yields 0 as before.
 */
int ompi_rte_convert_string_to_process_name(opal_process_name_t *name,
                                            const char* name_string)
{
    char *temp, *token;
    opal_jobid_t job;
    opal_vpid_t vpid;

    /* nowhere to store the result - error */
    if (NULL == name) {
        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
        return OPAL_ERR_BAD_PARAM;
    }

    /* set default */
    name->jobid = pmix_name_invalid.jobid;
    name->vpid = pmix_name_invalid.vpid;

    /* check for NULL string - error */
    if (NULL == name_string) {
        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
        return OPAL_ERR_BAD_PARAM;
    }

    /* work on a private copy because the string is split in place */
    temp = strdup(name_string);
    if (NULL == temp) {
        OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* locate the delimiter: everything before it is the jobid field */
    token = strchr(temp, ORTE_SCHEMA_DELIMITER_CHAR);
    if (NULL == token) {
        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
        free(temp);
        return OPAL_ERR_BAD_PARAM;
    }
    *token = '\0';
    token++;

    /* jobid field: wildcard, invalid marker or number */
    if (0 == strcmp(temp, ORTE_SCHEMA_WILDCARD_STRING)) {
        job = pmix_name_wildcard.jobid;
    } else if (0 == strcmp(temp, ORTE_SCHEMA_INVALID_STRING)) {
        job = pmix_name_invalid.jobid;
    } else {
        job = strtoul(temp, NULL, 10);
    }

    /* vpid field: wildcard, invalid marker or number */
    if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) {
        vpid = pmix_name_wildcard.vpid;
    } else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) {
        vpid = pmix_name_invalid.vpid;
    } else {
        vpid = strtoul(token, NULL, 10);
    }

    name->jobid = job;
    name->vpid = vpid;
    free(temp);

    return OPAL_SUCCESS;
}
int ompi_rte_convert_process_name_to_string(char** name_string,
const opal_process_name_t *name)
{
char *tmp, *tmp2;
if (NULL == name) { /* got an error */
OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
return OPAL_ERR_BAD_PARAM;
}
/* check for wildcard and invalid values - where encountered, insert the
* corresponding string so we can correctly parse the name string when
* it is passed back to us later
*/
if (pmix_name_wildcard.jobid == name->jobid) {
asprintf(&tmp, "%s", ORTE_SCHEMA_WILDCARD_STRING);
} else if (pmix_name_invalid.jobid == name->jobid) {
asprintf(&tmp, "%s", ORTE_SCHEMA_INVALID_STRING);
} else {
asprintf(&tmp, "%lu", (unsigned long)name->jobid);
}
if (pmix_name_wildcard.vpid == name->vpid) {
asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
} else if (pmix_name_invalid.vpid == name->vpid) {
asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
} else {
asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned long)name->vpid);
}
asprintf(name_string, "%s", tmp2);
free(tmp);
free(tmp2);
return OPAL_SUCCESS;
}
int ompi_rte_init(int *pargc, char ***pargv)
{
int ret;
char *error = NULL;
opal_process_name_t pname;
opal_proc_t *myname;
int u32, *u32ptr;
uint16_t u16, *u16ptr;
char **peers=NULL, *mycpuset;
char *envar, *ev1, *ev2;
opal_value_t *kv;
char *val;
size_t i;
uint64_t unique_key[2];
char *string_key;
u32ptr = &u32;
u16ptr = &u16;
memset(&pmix_process_info, 0, sizeof(pmix_process_info));
/* initialize the opal layer */
if (OPAL_SUCCESS != (ret = opal_init(pargc, pargv))) {
error = "opal_init";
goto error;
}
/* open and setup pmix */
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
OPAL_ERROR_LOG(ret);
/* we cannot run */
error = "pmix init";
goto error;
}
if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
/* we cannot run */
error = "pmix init";
goto error;
}
/* set the event base */
opal_pmix_base_set_evbase(opal_sync_event_base);
/* initialize the selected module */
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
/* we cannot run - this could be due to being direct launched
* without the required PMI support being built, so print
* out a help message indicating it */
opal_show_help("help-ompi-rte-pmix.txt", "no-pmi", true);
return OPAL_ERR_SILENT;
}
/* opal_pmix.init will have filled in proc name fields in
* OPAL, so transfer them here */
myname = opal_proc_local_get();
pmix_proc_my_name = myname->proc_name;
/* get our hostname */
pmix_process_info.nodename = opal_get_proc_hostname(myname);
/* get our local rank from PMI */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
&pmix_proc_my_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS != ret) {
error = "getting local rank";
goto error;
}
pmix_process_info.my_local_rank = u16;
/* get our node rank from PMI */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
&pmix_proc_my_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS != ret) {
error = "getting node rank";
goto error;
}
pmix_process_info.my_node_rank = u16;
/* get job size */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE,
&pmix_name_wildcard, &u32ptr, OPAL_UINT32);
if (OPAL_SUCCESS != ret) {
error = "getting job size";
goto error;
}
pmix_process_info.num_procs = u32;
/* push into the environ for pickup in MPI layer for
* MPI-3 required info key
*/
if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", pmix_process_info.num_procs);
putenv(ev1);
added_num_procs = true;
}
if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", pmix_process_info.num_procs);
putenv(ev2);
added_app_ctx = true;
}
/* get our app number from PMI - ok if not found */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
&pmix_proc_my_name, &u32ptr, OPAL_UINT32);
if (OPAL_SUCCESS == ret) {
pmix_process_info.app_num = u32;
} else {
pmix_process_info.app_num = 0;
}
/* get the number of local peers - required for wireup of
* shared memory BTL */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE,
&pmix_name_wildcard, &u32ptr, OPAL_UINT32);
if (OPAL_SUCCESS == ret) {
pmix_process_info.num_local_peers = u32 - 1; // want number besides ourselves
} else {
pmix_process_info.num_local_peers = 0;
}
/* setup transport keys in case the MPI layer needs them -
* we can use the jobfam and stepid as unique keys
* because they are unique values assigned by the RM
*/
if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
unique_key[0] = (pmix_proc_my_name.jobid & 0xff00) >> 16;
unique_key[1] = pmix_proc_my_name.jobid & 0x00ff;
if (NULL == (string_key = pre_condition_transports_print(unique_key))) {
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
return OPAL_ERR_OUT_OF_RESOURCE;
}
opal_output_verbose(2, ompi_rte_base_framework.framework_output,
"%s transport key %s",
OPAL_NAME_PRINT(pmix_proc_my_name), string_key);
asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
putenv(envar);
added_transport_keys = true;
/* cannot free the envar as that messes up our environ */
free(string_key);
}
/* retrieve temp directories info */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &pmix_name_wildcard, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
pmix_process_info.job_session_dir = val;
val = NULL;
} else {
/* we need to create something */
ret = _setup_job_session_dir(&pmix_process_info.job_session_dir);
if (OPAL_SUCCESS != ret) {
error = "job session directory";
goto error;
}
}
/* get our local peers */
if (0 < pmix_process_info.num_local_peers) {
/* if my local rank if too high, then that's an error */
if (pmix_process_info.num_local_peers < pmix_process_info.my_local_rank) {
ret = OPAL_ERR_BAD_PARAM;
error = "num local peers";
goto error;
}
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
&pmix_name_wildcard, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
peers = opal_argv_split(val, ',');
free(val);
} else {
peers = NULL;
}
} else {
peers = NULL;
}
/* set the locality */
if (NULL != peers) {
/* identify our location */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
&pmix_proc_my_name, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
mycpuset = val;
} else {
mycpuset = NULL;
}
pname.jobid = pmix_proc_my_name.jobid;
for (i=0; NULL != peers[i]; i++) {
pname.vpid = strtoul(peers[i], NULL, 10);
if (pname.vpid == pmix_proc_my_name.vpid) {
/* we are fully local to ourselves */
u16 = OPAL_PROC_ALL_LOCAL;
} else {
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
&pname, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
free(val);
} else {
/* all we can say is that it shares our node */
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
}
}
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALITY);
kv->type = OPAL_UINT16;
OPAL_OUTPUT_VERBOSE((1, ompi_rte_base_framework.framework_output,
"%s locality: proc %s locality %s",
OPAL_NAME_PRINT(pmix_proc_my_name),
OPAL_NAME_PRINT(pname), opal_hwloc_base_print_locality(u16)));
kv->data.uint16 = u16;
ret = opal_pmix.store_local(&pname, kv);
if (OPAL_SUCCESS != ret) {
error = "local store of locality";
opal_argv_free(peers);
if (NULL != mycpuset) {
free(mycpuset);
}
goto error;
}
OBJ_RELEASE(kv);
}
opal_argv_free(peers);
if (NULL != mycpuset) {
free(mycpuset);
}
}
/* poor attempt to detect we are bound */
if (NULL != getenv("SLURM_CPU_BIND_TYPE")) {
pmix_proc_is_bound = true;
}
/* push our hostname so others can find us, if they need to - the
* native PMIx component will ignore this request as the hostname
* is provided by the system */
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, pmix_process_info.nodename, OPAL_STRING);
if (OPAL_SUCCESS != ret) {
error = "db store hostname";
goto error;
}
return OPAL_SUCCESS;
error:
opal_show_help_finalize();
if (OPAL_ERR_SILENT != ret ) {
opal_show_help("help-ompi-rte-pmix.txt",
"internal-failure",
true, error, opal_strerror(ret), ret);
}
return ret;
}
/*
 * Filter callback handed to opal_os_dirpath_destroy() when the job
 * session directory is cleaned up.
 *
 * Keep:
 *   - non-zero files starting with "output-"
 *
 * @param root  directory that contains the entry
 * @param path  name of the entry inside root
 * @return false for an entry that is to be kept, true otherwise
 *
 * An "output-" file whose size cannot be determined is kept.
 */
static bool check_file(const char *root, const char *path)
{
    struct stat st;
    char *fullpath;
    int rc;

    /* everything that is not an "output-" file may go */
    if (0 != strncmp(path, "output-", strlen("output-"))) {
        return true;
    }

    /* opal_os_path() takes only the path elements followed by a NULL
     * terminator; the address of the result variable is not one of them */
    fullpath = opal_os_path(false, root, path, NULL);
    if (NULL == fullpath) {
        return false;
    }
    rc = stat(fullpath, &st);
    free(fullpath);
    if (0 != rc) {
        /* st holds nothing meaningful - err on the side of keeping */
        return false;
    }

    return (0 == st.st_size);
}
/*
 * Tear down what ompi_rte_init() set up: remove the environment
 * variables it added, shut down PMIx and remove the job session
 * directory.
 *
 * @return OMPI_SUCCESS
 */
int ompi_rte_finalize(void)
{
    /* remove the envars that we pushed into environ
     * so we leave that structure intact; clear the markers so the
     * bookkeeping matches the environment afterwards
     */
    if (added_transport_keys) {
        unsetenv(OPAL_MCA_PREFIX"orte_precondition_transports");
        added_transport_keys = false;
    }
    if (added_num_procs) {
        unsetenv(OPAL_MCA_PREFIX"orte_ess_num_procs");
        added_num_procs = false;
    }
    if (added_app_ctx) {
        unsetenv("OMPI_APP_CTX_NUM_PROCS");
        added_app_ctx = false;
    }

    /* shutdown pmix */
    if (NULL != opal_pmix.finalize) {
        opal_pmix.finalize();
        (void) mca_base_framework_close(&opal_pmix_base_framework);
    }

    /* cleanup the session directory we created */
    if (NULL != pmix_process_info.job_session_dir) {
        opal_os_dirpath_destroy(pmix_process_info.job_session_dir,
                                false, check_file);
        free(pmix_process_info.job_session_dir);
        /* do not leave a dangling pointer in the global info struct */
        pmix_process_info.job_session_dir = NULL;
    }

    return OMPI_SUCCESS;
}
/*
 * Abort this process: report the (optional) message through PMIx and
 * terminate with the given exit code.  Does not return.
 *
 * @param error_code  passed to opal_pmix.abort() and used as exit status
 * @param fmt         printf-style format of the message, or NULL for
 *                    no message
 */
void ompi_rte_abort(int error_code, char *fmt, ...)
{
    va_list arglist;
    char* buffer = NULL;
    struct timespec tp = {0, 100000};

    /* If there was a message, format it */
    if (NULL != fmt) {
        va_start(arglist, fmt);
        if (0 > vasprintf(&buffer, fmt, arglist)) {
            /* the pointer is unspecified after a failed vasprintf -
             * abort without a message rather than use it */
            buffer = NULL;
        }
        va_end(arglist);
    }

    /* call abort */
    opal_pmix.abort(error_code, buffer, NULL);
    free(buffer);

    /* provide a little delay for the PMIx thread to
     * get the info out */
    nanosleep(&tp, NULL);

    /* Now Exit */
    _exit(error_code);
}
/*
 * Abort a set of peer processes.
 *
 * This component currently provides no implementation: the call is a
 * no-op and all arguments are ignored.
 */
void ompi_rte_abort_peers(opal_process_name_t *procs,
                          int32_t num_procs,
                          int error_code)
{
    (void)procs;
    (void)num_procs;
    (void)error_code;
}
/* reference of the event handler registered in
 * ompi_rte_wait_for_debugger(); SIZE_MAX until _register_fn stores it */
static size_t handler = SIZE_MAX;
/* cleared by _register_fn once the registration callback has run */
static bool debugger_register_active = true;
/* cleared by _release_fn once the release event has been delivered */
static bool debugger_event_active = true;
/*
 * Event handler registered by ompi_rte_wait_for_debugger().
 *
 * Acknowledges the notification through the completion callback (when
 * one is supplied) and then clears debugger_event_active, which ends
 * the wait in ompi_rte_wait_for_debugger().  status, source, info and
 * results are not examined.
 */
static void _release_fn(int status,
                        const opal_process_name_t *source,
                        opal_list_t *info, opal_list_t *results,
                        opal_pmix_notification_complete_fn_t cbfunc,
                        void *cbdata)
{
    if (NULL != cbfunc) {
        /* the notifier has to be told that we are done with the event */
        cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata);
    }

    debugger_event_active = false;
}
/*
 * Completion callback of the event handler registration.
 *
 * Stores the handler reference for the later deregistration, releases
 * the list of codes that was passed along as cbdata and clears
 * debugger_register_active.  The status argument is not examined.
 */
static void _register_fn(int status,
                         size_t evhandler_ref,
                         void *cbdata)
{
    opal_list_t *code_list = (opal_list_t*)cbdata;

    handler = evhandler_ref;
    OPAL_LIST_RELEASE(code_list);

    debugger_register_active = false;
}
/*
* Wait for a debugger if asked. We support two ways of waiting for
* attaching debuggers -- see big comment in
* pmix/tools/pmixrun/debuggers.c explaining the two scenarios.
*/
void ompi_rte_wait_for_debugger(void)
{
int debugger;
opal_list_t *codes, directives;
opal_value_t *kv;
char *evar;
int time;
/* check PMIx to see if we are under a debugger */
debugger = pmix_in_parallel_debugger;
if (1 == MPIR_being_debugged) {
debugger = 1;
}
if (!debugger && NULL == getenv("PMIX_TEST_DEBUGGER_ATTACH")) {
/* if not, just return */
return;
}
/* if we are being debugged, then we need to find
* the correct plug-ins
*/
ompi_debugger_setup_dlls();
if (NULL != (evar = getenv("PMIX_TEST_DEBUGGER_SLEEP"))) {
time = strtol(evar, NULL, 10);
sleep(time);
return;
}
/* register an event handler for the PMIX_ERR_DEBUGGER_RELEASE event */
codes = OBJ_NEW(opal_list_t);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup("errorcode");
kv->type = OPAL_INT;
kv->data.integer = OPAL_ERR_DEBUGGER_RELEASE;
opal_list_append(codes, &kv->super);
OBJ_CONSTRUCT(&directives, opal_list_t);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
kv->type = OPAL_STRING;
kv->data.string = strdup("MPI-DEBUGGER-ATTACH");
opal_list_append(&directives, &kv->super);
opal_pmix.register_evhandler(codes, &directives, _release_fn, _register_fn, codes);
/* let the MPI progress engine run while we wait for registration to complete */
OMPI_WAIT_FOR_COMPLETION(debugger_register_active);
OPAL_LIST_DESTRUCT(&directives);
/* let the MPI progress engine run while we wait for debugger release */
OMPI_WAIT_FOR_COMPLETION(debugger_event_active);
/* deregister the event handler */
opal_pmix.deregister_evhandler(handler, NULL, NULL);
}
/*
 * Report whether connect/accept is supported for the given port.
 *
 * Support has not been worked out for this component yet, so the
 * answer is always "no" and the port argument is ignored.
 */
bool ompi_rte_connect_accept_support(const char *port)
{
    (void)port;

    /* not sure how to support this yet */
    return false;
}
/*
 * Append one 64-bit half of the transport key to buf as zero-padded
 * hex, one unsigned int at a time in memory order.  A word that is 0
 * is first replaced by a fixed non-zero pattern; the (possibly
 * modified) words are stored back into *half, as before.
 */
static void pre_condition_append_half(char *buf, size_t buflen, uint64_t *half)
{
    unsigned int words[sizeof(uint64_t) / sizeof(unsigned int)];
    size_t i, j, used;

    /* copy instead of aliasing the uint64_t through an unsigned int* */
    memcpy(words, half, sizeof(words));

    for (i = 0; i < sizeof(words) / sizeof(words[0]); ++i) {
        if (0 == words[i]) {
            /* inject some energy */
            for (j = 0; j < sizeof(unsigned int); j++) {
                words[i] |= (unsigned int)(j << j);
            }
        }
        used = strlen(buf);
        /* two hex characters per byte; the width is supplied as an
         * argument, so no format string has to be built at run time */
        snprintf(buf + used, buflen - used, "%0*x",
                 (int)(sizeof(unsigned int) * 2), words[i]);
    }

    memcpy(half, words, sizeof(words));
}

/*
 * Render a two-element key as the string used for the
 * "orte_precondition_transports" envar.
 *
 * @param unique_key  array of two uint64_t values; words that are 0 are
 *                    overwritten with a non-zero pattern
 * @return newly allocated string (caller frees) holding the two
 *         numbers in zero-padded hex with a dash in between, or NULL
 *         when the string cannot be allocated
 */
static char* pre_condition_transports_print(uint64_t *unique_key)
{
    size_t string_key_len, used;
    char *string_key;

    /* string is two 64 bit numbers printed in hex with a dash between
     * and zero padding.
     */
    string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1;
    string_key = malloc(string_key_len);
    if (NULL == string_key) {
        return NULL;
    }
    string_key[0] = '\0';

    /* print the first number */
    pre_condition_append_half(string_key, string_key_len, &unique_key[0]);

    /* print the middle dash */
    used = strlen(string_key);
    snprintf(string_key + used, string_key_len - used, "-");

    /* print the second number */
    pre_condition_append_half(string_key, string_key_len, &unique_key[1]);

    return string_key;
}
/*
 * Build the job-level session directory path and store it in
 * pmix_process_info.job_session_dir.
 *
 * The base directory is the first of the TMPDIR, TEMP and TMP
 * environment variables that is set, falling back to "/tmp".  The path
 * has the form <base>/ompi.<nodename>.<euid>/jf.0/<jobid>.
 *
 * NOTE(review): the sdir argument is never read or written here; the
 * result is only placed in pmix_process_info.job_session_dir.  Confirm
 * whether callers expect *sdir to be filled in.
 *
 * @param sdir  currently unused
 * @return      OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE if the path
 *              string could not be created (job_session_dir is then
 *              set to NULL)
 */
static int _setup_job_session_dir(char **sdir)
{
    static const char *const tmp_env_names[] = { "TMPDIR", "TEMP", "TMP" };
    const char *base = NULL;
    uid_t euid;
    size_t n;
    int len;

    (void)sdir;

    /* get the effective uid */
    euid = geteuid();

    /* first environment variable that is set wins */
    for (n = 0; NULL == base && n < sizeof(tmp_env_names) / sizeof(tmp_env_names[0]); ++n) {
        base = getenv(tmp_env_names[n]);
    }
    if (NULL == base) {
        base = "/tmp";
    }

    len = asprintf(&pmix_process_info.job_session_dir,
                   "%s/ompi.%s.%lu/jf.0/%u", base,
                   pmix_process_info.nodename,
                   (unsigned long)euid,
                   pmix_proc_my_name.jobid);
    if (len < 0) {
        /* make sure nobody picks up an undefined pointer */
        pmix_process_info.job_session_dir = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    return OPAL_SUCCESS;
}

View File

@ -5,7 +5,7 @@
* reserved.
* Copyright (c) 2011-2016 INRIA. All rights reserved.
* Copyright (c) 2012-2017 Bordeaux Polytechnic Institute
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@ -22,7 +22,7 @@
#include "ompi_config.h"
#include "opal/constants.h"
#include "opal/mca/hwloc/hwloc-internal.h"
#include "opal/mca/hwloc/base/base.h"
#include "ompi/mca/topo/treematch/topo_treematch.h"
#include "ompi/mca/topo/treematch/treematch/treematch.h"
@ -172,7 +172,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
if( OPAL_SUCCESS != err ) {
opal_output(0, "Unable to extract peer %s nodeid from the modex.\n",
OMPI_NAME_PRINT(&(proc->super)));
OMPI_NAME_PRINT(&(proc->super.proc_name)));
colors[i] = -1;
continue;
}

View File

@ -8,7 +8,7 @@
* reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -34,6 +34,7 @@
#include "ompi/communicator/communicator.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/mpiext/affinity/c/mpiext_affinity_c.h"
@ -104,12 +105,12 @@ static int get_rsrc_ompi_bound(char str[OMPI_AFFINITY_STRING_MAX])
return OMPI_SUCCESS;
}
if (NULL == orte_proc_applied_binding) {
if (NULL == ompi_proc_applied_binding) {
ret = OPAL_ERR_NOT_BOUND;
} else {
ret = opal_hwloc_base_cset2str(str, OMPI_AFFINITY_STRING_MAX,
opal_hwloc_topology,
orte_proc_applied_binding);
ompi_proc_applied_binding);
}
if (OPAL_ERR_NOT_BOUND == ret) {
strncpy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX - 1);
@ -297,12 +298,12 @@ static int get_layout_ompi_bound(char str[OMPI_AFFINITY_STRING_MAX])
}
/* Find out what OMPI bound us to and prettyprint it */
if (NULL == orte_proc_applied_binding) {
if (NULL == ompi_proc_applied_binding) {
ret = OPAL_ERR_NOT_BOUND;
} else {
ret = opal_hwloc_base_cset2mapstr(str, OMPI_AFFINITY_STRING_MAX,
opal_hwloc_topology,
orte_proc_applied_binding);
ompi_proc_applied_binding);
}
if (OPAL_ERR_NOT_BOUND == ret) {
strncpy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX - 1);