Коммит
1f00a1112c
@ -14,6 +14,7 @@
|
||||
* Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -146,7 +147,7 @@ static int qsort_callback(const void *a, const void *b);
|
||||
int opal_cmd_line_create(opal_cmd_line_t *cmd,
|
||||
opal_cmd_line_init_t *table)
|
||||
{
|
||||
int i, ret = OPAL_SUCCESS;
|
||||
int ret = OPAL_SUCCESS;
|
||||
|
||||
/* Check bozo case */
|
||||
|
||||
@ -155,8 +156,17 @@ int opal_cmd_line_create(opal_cmd_line_t *cmd,
|
||||
}
|
||||
OBJ_CONSTRUCT(cmd, opal_cmd_line_t);
|
||||
|
||||
/* Ensure we got a table */
|
||||
ret = opal_cmd_line_add(cmd, table);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Add a table to an existing cmd line object */
|
||||
int opal_cmd_line_add(opal_cmd_line_t *cmd,
|
||||
opal_cmd_line_init_t *table)
|
||||
{
|
||||
int i, ret;
|
||||
|
||||
/* Ensure we got a table */
|
||||
if (NULL == table) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -164,9 +174,7 @@ int opal_cmd_line_create(opal_cmd_line_t *cmd,
|
||||
/* Loop through the table */
|
||||
|
||||
for (i = 0; ; ++i) {
|
||||
|
||||
/* Is this the end? */
|
||||
|
||||
if ('\0' == table[i].ocl_cmd_short_name &&
|
||||
NULL == table[i].ocl_cmd_single_dash_name &&
|
||||
NULL == table[i].ocl_cmd_long_name) {
|
||||
@ -174,16 +182,14 @@ int opal_cmd_line_create(opal_cmd_line_t *cmd,
|
||||
}
|
||||
|
||||
/* Nope -- it's an entry. Process it. */
|
||||
|
||||
ret = make_opt(cmd, &table[i]);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append a command line entry to the previously constructed command line
|
||||
*/
|
||||
@ -965,8 +971,19 @@ static int make_opt(opal_cmd_line_t *cmd, opal_cmd_line_init_t *e)
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* Allocate and fill an option item */
|
||||
/* see if the option already exists */
|
||||
if (NULL != e->ocl_cmd_single_dash_name &&
|
||||
NULL != find_option(cmd, e->ocl_cmd_single_dash_name)) {
|
||||
opal_output(0, "Duplicate cmd line entry %s", e->ocl_cmd_single_dash_name);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
if (NULL != e->ocl_cmd_long_name &&
|
||||
NULL != find_option(cmd, e->ocl_cmd_long_name)) {
|
||||
opal_output(0, "Duplicate cmd line entry %s", e->ocl_cmd_long_name);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* Allocate and fill an option item */
|
||||
option = OBJ_NEW(cmd_line_option_t);
|
||||
if (NULL == option) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -268,6 +268,16 @@ BEGIN_C_DECLS
|
||||
OPAL_DECLSPEC int opal_cmd_line_create(opal_cmd_line_t *cmd,
|
||||
opal_cmd_line_init_t *table);
|
||||
|
||||
/* Add a table of opal_cmd_line_init_t instances
|
||||
* to an existing OPAL command line handle.
|
||||
*
|
||||
* Multiple calls to opal_cmd_line_add are permitted - each
|
||||
* subsequent call will simply append new options to the existing
|
||||
* handle. Note that the call returns an error as soon as it meets a duplicate option.
|
||||
*/
|
||||
OPAL_DECLSPEC int opal_cmd_line_add(opal_cmd_line_t *cmd,
|
||||
opal_cmd_line_init_t *table);
|
||||
|
||||
/**
|
||||
* Create a command line option.
|
||||
*
|
||||
|
21
orte/bindings/README
Обычный файл
21
orte/bindings/README
Обычный файл
@ -0,0 +1,21 @@
|
||||
Copyright (c) 2016 Intel, Inc. All rights reserved
|
||||
|
||||
$COPYRIGHT$
|
||||
|
||||
Additional copyrights may follow
|
||||
|
||||
$HEADER$
|
||||
|
||||
===========================================================================
|
||||
|
||||
This is where bindings of ORTE functions to alternative programming languages
|
||||
such as Python and C++ reside. Not every ORTE function has been provided with
|
||||
a wrapper - it is purely on an as-needed basis. However, there is no restriction
|
||||
on the number of wrappers that can exist, nor on what type of function is wrapped.
|
||||
|
||||
There is only one rule to observe: you can wrap a framework, but you cannot wrap a
|
||||
specific plugin within that framework. This constraint flows from the fact that
|
||||
plugins are only accessed via the framework interface - thus, there is no way to
|
||||
guarantee that a particular plugin will be the active selection.
|
||||
|
||||
|
49
orte/bindings/python/README
Обычный файл
49
orte/bindings/python/README
Обычный файл
@ -0,0 +1,49 @@
|
||||
===========================================================================
|
||||
CFFI based Python wrapper for ORTE
|
||||
===========================================================================
|
||||
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
This example starts up a persistent DVM and then spawns some tasks using
|
||||
Python.
|
||||
|
||||
$ virtualenv ve
|
||||
$ source ve/bin/activate
|
||||
$ pip install orte-cffi
|
||||
$ orte-dvm --report-uri dvm_uri
|
||||
$ python examples/submit.py
|
||||
|
||||
|
||||
Create a distfile
|
||||
-----------------
|
||||
|
||||
If you want to create an sdist (source distribution) file:
|
||||
|
||||
$ virtualenv ve
|
||||
$ source ve/bin/activate
|
||||
$ python setup.py sdist
|
||||
|
||||
|
||||
Uploading sdist to pypi
|
||||
-----------------------
|
||||
|
||||
Assuming you have admin privileges to the pypi package repository for this
|
||||
package, a new version can be uploaded using twine:
|
||||
|
||||
$ virtualenv ve
|
||||
$ source ve/bin/activate
|
||||
$ pip install twine
|
||||
$ twine upload dist/orte-cffi-`python setup.py --version`.tar.gz
|
||||
|
||||
|
||||
Building (for development purposes only)
|
||||
----------------------------------------
|
||||
|
||||
If you want to create a non-pip build:
|
||||
|
||||
$ virtualenv ve
|
||||
$ source ve/bin/activate
|
||||
$ pip install cffi
|
||||
$ python src/orte-cffi/build.py
|
68
orte/bindings/python/examples/submit.py
Исполняемый файл
68
orte/bindings/python/examples/submit.py
Исполняемый файл
@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from orte_cffi import ffi, lib
|
||||
|
||||
DVM_URI = "file:dvm_uri"
|
||||
|
||||
@ffi.def_extern()
def launch_cb(task, jdata, status, cbdata):
    """Launch callback handed to ``lib.orte_submit_job``.

    Reports that ``task`` has started and decrements the ``myspawn`` counter
    of the ``Submit`` instance registered for it in ``task_instance_map``.

    :param task: integer index of the submitted job.
    :param jdata: opaque ``orte_job_t *`` (unused here).
    :param status: integer status supplied by the library (unused here).
    :param cbdata: opaque user pointer (unused here).
    """
    # A parenthesized single-argument print behaves identically as a
    # Python 2 statement and as a Python 3 function call.
    print("Task %d is started!" % task)
    instance = task_instance_map[task]
    instance.myspawn -= 1
|
||||
|
||||
@ffi.def_extern()
def finish_cb(task, jdata, status, cbdata):
    """Completion callback handed to ``lib.orte_submit_job``.

    Reports the final status of ``task``, decrements the ``mywait`` counter
    of the ``Submit`` instance registered for it, and drops the task from
    ``task_instance_map``.

    :param task: integer index of the submitted job.
    :param jdata: opaque ``orte_job_t *`` (unused here).
    :param status: integer completion status reported by the library.
    :param cbdata: opaque user pointer (unused here).
    """
    # A parenthesized single-argument print behaves identically as a
    # Python 2 statement and as a Python 3 function call.
    print("Task %d is completed with status %d!" % (task, status))
    instance = task_instance_map[task]
    instance.mywait -= 1
    # The task is finished; no further callbacks need to find its owner.
    del task_instance_map[task]
|
||||
|
||||
# Dictionary to find class instance from task id
|
||||
task_instance_map = {}
|
||||
|
||||
# Request to create a background asynchronous event loop
|
||||
os.putenv("OMPI_MCA_ess_tool_async_progress", "enabled")
|
||||
|
||||
class Submit():
    """Submit three trivial jobs to a running DVM and wait for them to finish.

    ``launch_cb`` decrements ``myspawn`` and ``finish_cb`` decrements
    ``mywait`` on the instance registered in ``task_instance_map``; ``run``
    polls both counters until they reach zero.
    """

    # Class-level defaults; the first ``+=`` on an instance rebinds them as
    # per-instance counters.
    mywait = 0   # jobs submitted whose finish_cb has not run yet
    myspawn = 0  # jobs submitted whose launch_cb has not run yet

    def run(self):
        """Initialise the submit library, submit the jobs and block until done.

        :raises RuntimeError: if ``orte_submit_init`` or ``orte_submit_job``
            returns a non-zero status.
        """
        # cffi only accepts bytes initialisers for ``char[]`` on Python 3;
        # on Python 2 ``str`` already is ``bytes`` so this is a no-op there.
        uri = DVM_URI if isinstance(DVM_URI, bytes) else DVM_URI.encode()

        # The Python list keeps the individual C strings alive for as long
        # as ``argv`` (which only stores pointers to them) is in use.
        argv_keepalive = [
            ffi.new("char[]", b"submit"),  # Will be stripped off by the library
            ffi.new("char[]", b"--hnp"), ffi.new("char[]", uri),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", argv_keepalive)
        rc = lib.orte_submit_init(3, argv, ffi.NULL)
        if rc != 0:
            raise RuntimeError("orte_submit_init failed with status %d" % rc)

        # Out-parameter that receives the index of each submitted job.
        index = ffi.new("int *")

        for i in range(3):

            argv_keepalive = [
                ffi.new("char[]", b"RADICAL-Pilot"),
                ffi.new("char[]", b"--np"), ffi.new("char[]", b"1"),
                ffi.new("char[]", b"false"),
                ffi.NULL,  # Required
            ]
            argv = ffi.new("char *[]", argv_keepalive)
            rc = lib.orte_submit_job(argv, index, lib.launch_cb, ffi.NULL,
                                     lib.finish_cb, ffi.NULL)
            if rc != 0:
                # Without this check a failed submission would still bump the
                # counters below and the wait loop would never terminate.
                raise RuntimeError("orte_submit_job failed with status %d" % rc)
            task = index[0]
            # NOTE(review): if a callback can fire before this assignment it
            # will not find the task in the map - confirm against the library.
            task_instance_map[task] = self
            self.mywait += 1
            self.myspawn += 1
            print("Task %d submitted!" % task)

        # Poll until every job has reported both launch and completion.
        while self.myspawn > 0 or self.mywait > 0:
            time.sleep(0.1)

        print("Done!")
|
||||
|
||||
rp = Submit()
|
||||
rp.run()
|
16
orte/bindings/python/setup.py
Обычный файл
16
orte/bindings/python/setup.py
Обычный файл
@ -0,0 +1,16 @@
|
||||
from setuptools import setup

# Packaging metadata for the CFFI-based ORTE wrapper.
setup(
    # Identity
    name="orte-cffi",
    version="0.4.0",
    description="CFFI-based Python wrapper for Open RTE",
    keywords="mpi cffi",
    url="http://www.open-mpi.org",
    license="New BSD",
    # Maintainer
    author="Mark Santcroos",
    author_email="mark.santcroos@rutgers.edu",
    # Contents
    packages=['src/orte-cffi'],
    # cffi is needed both to build the extension and to use it afterwards.
    setup_requires=["cffi>=1.5.0"],
    install_requires=["cffi>=1.5.0"],
    # "<build script>:<name of the FFI object defined in it>"
    cffi_modules=["src/orte-cffi/build.py:ffi"],
)
|
140
orte/bindings/python/src/orte-cffi/build.py
Обычный файл
140
orte/bindings/python/src/orte-cffi/build.py
Обычный файл
@ -0,0 +1,140 @@
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
|
||||
##########################################################################
|
||||
# Helper functions #
|
||||
##########################################################################
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# Get a path value from ompi_info based on key
|
||||
#
|
||||
def ompi_info_path(key):
    """Return the directory ``ompi_info --path <key> --parseable`` reports.

    The command output is expected to look like ``path:<key>:<directory>``.

    :param key: path name to query, e.g. ``'libdir'``.
    :returns: the reported directory, stripped of surrounding whitespace.
    :raises Exception: if ``ompi_info`` exits non-zero (message is its
        stderr), if the output does not have the expected shape, or if the
        reported path is not an existing directory.
    """
    cmd = ['ompi_info', '--path', key, '--parseable']

    # universal_newlines=True makes communicate() return text instead of
    # bytes on Python 3, so the string handling below works on 2 and 3.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise Exception(stderr)

    # Split on the first two colons only: the path itself may contain ':'.
    fields = stdout.split(':', 2)
    if len(fields) != 3:
        raise Exception('Parse error')
    p_str, l_str, path = fields

    if p_str.strip() != 'path':
        raise Exception('Parse error')
    if l_str.strip() != key:
        raise Exception('Parse error')

    path = path.strip()

    if not os.path.isdir(path):
        raise Exception('Path "%s" is not an existing directory' % path)

    return path
|
||||
|
||||
|
||||
#
|
||||
# Get the pkgconfig directory, assuming it is '$libdir/pkgconfig'
|
||||
#
|
||||
def get_pkgconfig_dir():
    """Return ``<libdir>/pkgconfig`` for the Open MPI install found by ompi_info.

    :raises Exception: if that directory does not exist (or if looking up
        ``libdir`` itself fails).
    """
    candidate = os.path.join(ompi_info_path('libdir'), 'pkgconfig')

    if os.path.isdir(candidate):
        return candidate

    raise Exception('Path "%s" is not an existing directory' % candidate)
|
||||
|
||||
|
||||
#
|
||||
# Run pkgconfig to get include dirs and lib dirs.
|
||||
# Optionally accepts variables to define for pkg-config.
|
||||
#
|
||||
def pkgconfig(libname, variables=None):
    """Ask pkg-config for the include and library directories of ``libname``.

    :param libname: pkg-config package name, e.g. ``'orte'``.
    :param variables: optional mapping; each item is passed to pkg-config as
        ``--define-variable=<key>=<value>``.
    :returns: ``{'include_dirs': [...], 'library_dirs': [...]}`` built from
        the ``-I`` and ``-L`` flags pkg-config prints (prefix removed).
    :raises Exception: if pkg-config exits non-zero (message is its stderr).
    """
    cmd = ['pkg-config', '--cflags-only-I', '--libs-only-L', libname]

    if variables:
        # items() exists on both Python 2 and 3 (iteritems() is 2-only).
        for k, v in variables.items():
            cmd.append('--define-variable=%s=%s' % (k, v))

    # universal_newlines=True makes communicate() return text instead of
    # bytes on Python 3, so the startswith() checks below can match.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise Exception(stderr)

    include_dirs = []
    library_dirs = []

    # Flags are whitespace separated; anything that is neither -L nor -I
    # is ignored.
    for item in stdout.split():
        if item.startswith("-L"):
            library_dirs.append(item[2:])
        elif item.startswith("-I"):
            include_dirs.append(item[2:])

    return {'include_dirs': include_dirs,
            'library_dirs': library_dirs}
|
||||
|
||||
|
||||
# Get the pkgconfig dir from ompi_info and export it to the environment.
# It is prepended rather than assigned so that it is searched first while
# any PKG_CONFIG_PATH the user already set is still honoured.
pkgconfig_dir = get_pkgconfig_dir()
os.environ['PKG_CONFIG_PATH'] = os.pathsep.join(
    filter(None, [pkgconfig_dir, os.environ.get('PKG_CONFIG_PATH')]))

# Get the pkgincludedir from ompi_info and hand it to pkg-config as a
# variable definition.
pkgincludedir = ompi_info_path('pkgincludedir')
pkgcfg = pkgconfig('orte', variables={'pkgincludedir': pkgincludedir})

# Extract include directories and check that at least one was reported
include_dirs = pkgcfg['include_dirs']
if not include_dirs:
    raise Exception("No include dirs found")

# Extract library directories and check that at least one was reported
library_dirs = pkgcfg['library_dirs']
if not library_dirs:
    raise Exception("No library dirs found")
|
||||
|
||||
|
||||
##########################################################################
|
||||
# CFFI specifics #
|
||||
##########################################################################
|
||||
|
||||
|
||||
from cffi import FFI
|
||||
ffi = FFI()
|
||||
|
||||
ffi.set_source("orte_cffi", """
|
||||
#include "orte/orted/orted_submit.h"
|
||||
""",
|
||||
libraries=["open-rte"],
|
||||
include_dirs=include_dirs,
|
||||
library_dirs=library_dirs
|
||||
)
|
||||
|
||||
ffi.cdef("""
|
||||
/* Types */
|
||||
typedef ... orte_job_t;
|
||||
typedef ... opal_cmd_line_t;
|
||||
typedef void (*orte_submit_cbfunc_t)(int index, orte_job_t *jdata, int ret, void *cbdata);
|
||||
|
||||
/* Functions */
|
||||
int orte_submit_init(int argc, char *argv[], opal_cmd_line_t *opts);
|
||||
int orte_submit_job(char *cmd[], int *index,
|
||||
orte_submit_cbfunc_t launch_cb, void *launch_cbdata,
|
||||
orte_submit_cbfunc_t complete_cb, void *complete_cbdata);
|
||||
void orte_submit_finalize(void);
|
||||
int orte_submit_cancel(int index);
|
||||
int orte_submit_halt(void);
|
||||
|
||||
/* Callbacks */
|
||||
extern "Python" void launch_cb(int, orte_job_t *, int, void *);
|
||||
extern "Python" void finish_cb(int, orte_job_t *, int, void *);
|
||||
""")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ffi.compile(verbose=True)
|
@ -146,7 +146,8 @@ enum {
|
||||
ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 54),
|
||||
ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 55),
|
||||
ORTE_ERR_OPEN_CHANNEL_DUPLICATE = (ORTE_ERR_BASE - 56),
|
||||
ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 57)
|
||||
ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 57),
|
||||
ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 58)
|
||||
};
|
||||
|
||||
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
|
||||
|
@ -88,7 +88,7 @@ static int default_hnp_close(void)
|
||||
|
||||
static int default_hnp_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if( ORTE_PROC_IS_HNP ) {
|
||||
if (ORTE_PROC_IS_HNP && !ORTE_PROC_IS_MASTER) {
|
||||
/* we are the default HNP component */
|
||||
*priority = my_priority;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_default_hnp_module;
|
||||
|
35
orte/mca/errmgr/dvm/Makefile.am
Обычный файл
35
orte/mca/errmgr/dvm/Makefile.am
Обычный файл
@ -0,0 +1,35 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
|
||||
errmgr_dvm.h \
|
||||
errmgr_dvm_component.c \
|
||||
errmgr_dvm.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_errmgr_dvm_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_dvm.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_dvm.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ortelibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_dvm_la_SOURCES = $(sources)
|
||||
mca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_dvm_la_SOURCES =$(sources)
|
||||
libmca_errmgr_dvm_la_LDFLAGS = -module -avoid-version
|
693
orte/mca/errmgr/dvm/errmgr_dvm.c
Обычный файл
693
orte/mca/errmgr/dvm/errmgr_dvm.c
Обычный файл
@ -0,0 +1,693 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_WAIT_H
|
||||
#include <sys/wait.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_dvm.h"
|
||||
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
|
||||
/******************
|
||||
* dvm module
|
||||
******************/
|
||||
/*
 * Function table exported by the dvm errmgr component.
 *
 * The initializer is positional, so entries must stay in the order in which
 * orte_errmgr_base_module_t declares its members.  init, finalize,
 * predicted_fault, suggest_map_targets and ft_event are the file-local
 * functions declared above; every orte_errmgr_base_* entry reuses an
 * implementation defined outside this file.
 */
orte_errmgr_base_module_t orte_errmgr_dvm_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
/* NOTE(review): this slot is deliberately left empty; which member it
 * corresponds to is not visible here - confirm against the definition of
 * orte_errmgr_base_module_t. */
NULL,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static void job_errors(int fd, short args, void *cbdata);
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
/* setup state machine to trap job errors */
|
||||
orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
|
||||
|
||||
/* set the lost connection state to run at MSG priority so
|
||||
* we can process any last messages from the proc
|
||||
*/
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
|
||||
|
||||
/* setup state machine to trap proc errors */
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Module shutdown hook (installed as the second entry of
 * orte_errmgr_dvm_module).  Performs no work and always returns
 * ORTE_SUCCESS.
 */
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Ask the PLM to terminate every process of the given job.
 *
 * A temporary proc object named <jobid, ORTE_VPID_WILDCARD> is placed in a
 * one-element pointer array and handed to orte_plm.terminate_procs().  Both
 * temporaries live on the stack and are destructed before returning.
 */
static void _terminate_job(orte_jobid_t jobid)
{
    orte_proc_t wildcard;
    opal_pointer_array_t targets;

    /* describe "all procs of this job" */
    OBJ_CONSTRUCT(&wildcard, orte_proc_t);
    wildcard.name.vpid = ORTE_VPID_WILDCARD;
    wildcard.name.jobid = jobid;

    /* wrap it in the array type the PLM expects */
    OBJ_CONSTRUCT(&targets, opal_pointer_array_t);
    opal_pointer_array_init(&targets, 1, 1, 1);
    opal_pointer_array_add(&targets, &wildcard);

    orte_plm.terminate_procs(&targets);

    OBJ_DESTRUCT(&targets);
    OBJ_DESTRUCT(&wildcard);
}
|
||||
|
||||
static void job_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
orte_job_state_t jobstate;
|
||||
orte_exit_code_t sts;
|
||||
orte_proc_t *aborted_proc;
|
||||
opal_buffer_t *answer;
|
||||
int32_t rc, ret;
|
||||
int room, *rmptr;
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* if the jdata is NULL, then we ignore it as this
|
||||
* is reporting an unrecoverable error
|
||||
*/
|
||||
if (NULL == caddy->jdata) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
/* update the state */
|
||||
jdata = caddy->jdata;
|
||||
jobstate = caddy->job_state;
|
||||
jdata->state = jobstate;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: job %s reported state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
orte_job_state_to_str(jobstate)));
|
||||
|
||||
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
|
||||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
|
||||
/* disable routing as we may not have performed the daemon
|
||||
* wireup - e.g., in a managed environment, all the daemons
|
||||
* "phone home", but don't actually wireup into the routed
|
||||
* network until they receive the launch message
|
||||
*/
|
||||
orte_routing_is_enabled = false;
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
/* if it was a dynamic spawn, then we better tell them this didn't work */
|
||||
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
|
||||
rc = jobstate;
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* pack the room number */
|
||||
rmptr = &room;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm sending dyn error release of job %s to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_NAME_PRINT(&jdata->originator)));
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
|
||||
ORTE_RML_TAG_LAUNCH_RESP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
|
||||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
aborted_proc = NULL;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
|
||||
sts = aborted_proc->exit_code;
|
||||
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
|
||||
if (WIFSIGNALED(sts)) { /* died on signal */
|
||||
#ifdef WCOREDUMP
|
||||
if (WCOREDUMP(sts)) {
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
|
||||
WTERMSIG(sts));
|
||||
sts = WTERMSIG(sts);
|
||||
} else {
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
||||
WTERMSIG(sts));
|
||||
sts = WTERMSIG(sts);
|
||||
}
|
||||
#else
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
||||
WTERMSIG(sts));
|
||||
sts = WTERMSIG(sts);
|
||||
#endif /* WCOREDUMP */
|
||||
} else {
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
|
||||
WEXITSTATUS(sts));
|
||||
sts = WEXITSTATUS(sts);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if this is the daemon job, then we need to ensure we
|
||||
* output an error message indicating we couldn't launch the
|
||||
* daemons */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
}
|
||||
}
|
||||
|
||||
/* if the daemon job aborted and we haven't heard from everyone yet,
|
||||
* then this could well have been caused by a daemon not finding
|
||||
* a way back to us. In this case, output a message indicating a daemon
|
||||
* died without reporting. Otherwise, say nothing as we
|
||||
* likely already output an error message */
|
||||
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
||||
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
jdata->num_procs != jdata->num_reported) {
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
||||
}
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pptr, *proct;
|
||||
orte_process_name_t *proc = &caddy->name;
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
int i;
|
||||
int32_t i32, *i32ptr;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: for proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* get the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||
/* could be a race condition */
|
||||
goto cleanup;
|
||||
}
|
||||
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
||||
|
||||
/* we MUST handle a communication failure before doing anything else
|
||||
* as it requires some special care to avoid normal termination issues
|
||||
* for local application procs
|
||||
*/
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
/* is this to a daemon? */
|
||||
if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
|
||||
/* nope - ignore it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure to non-daemon proc - ignoring it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto cleanup;
|
||||
}
|
||||
/* if this is my own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure on my own connection - ignoring it",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto cleanup;
|
||||
}
|
||||
/* mark the daemon as gone */
|
||||
ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
|
||||
/* if we have ordered orteds to terminate or abort
|
||||
* is in progress, record it */
|
||||
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: daemons terminating - recording daemon %s as gone",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* if all my routes and local children are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* at least one is still alive */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: at least one proc (%s) still alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proct->name)));
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
/* call our appropriate exit procedure */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr_dvm: all routes and children gone - ordering exit",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: %d routes remain alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)orte_routed.num_routes()));
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: daemon %s - aborting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
|
||||
/* record the first one to fail */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* output an error message so the user knows what happened */
|
||||
orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
|
||||
/* mark the daemon job as failed */
|
||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* update our exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* just in case the exit code hadn't been set, do it here - this
|
||||
* won't override any reported exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* update the proc state - can get multiple reports on a proc
|
||||
* depending on circumstances, so ensure we only do this once
|
||||
*/
|
||||
if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
|
||||
pptr->state = state;
|
||||
}
|
||||
|
||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
||||
* any of our routes or local children remain alive - if not, then
|
||||
* terminate ourselves. */
|
||||
if (orte_orteds_term_ordered) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
|
||||
goto keep_going;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if all my routes and children are gone, then terminate
|
||||
ourselves nicely (i.e., this is a normal termination) */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default:dvm all routes gone - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
keep_going:
|
||||
/* ensure we record the failed proc properly so we can report
|
||||
* the error once we terminate
|
||||
*/
|
||||
switch (state) {
|
||||
case ORTE_PROC_STATE_KILLED_BY_CMD:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s killed by cmd",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
||||
* and we don't flag it as such
|
||||
*/
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
/* don't abort the job as this isn't an abnormal termination */
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s aborted",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s aborted by signal",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s terminated without sync",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* now treat a special case - if the proc exit'd without a required
|
||||
* sync, it may have done so with a zero exit code. We want to ensure
|
||||
* that the user realizes there was an error, so in this -one- case,
|
||||
* we overwrite the process' exit code with the default error code
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
opal_buffer_t *answer;
|
||||
int id, *idptr, ret;
|
||||
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START) {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
} else {
|
||||
jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
|
||||
}
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* send a notification to the requestor - indicate that this is a spawn response */
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
/* pack the return status */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &pptr->exit_code, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* pack the jobid to be returned */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
idptr = &id;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&idptr, OPAL_INT)) {
|
||||
/* pack the sender's index to the tracking object */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, idptr, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
|
||||
/* we need to send the requestor more info about what happened */
|
||||
opal_dss.pack(answer, &jdata->state, 1, ORTE_JOB_STATE_T);
|
||||
opal_dss.pack(answer, &pptr, 1, ORTE_PROC);
|
||||
opal_dss.pack(answer, &pptr->node, 1, ORTE_NODE);
|
||||
}
|
||||
/* return response */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
|
||||
ORTE_RML_TAG_LAUNCH_RESP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
/* record that we notified about this job */
|
||||
jdata->state = ORTE_JOB_STATE_NOTIFIED;
|
||||
CLEANUP:
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
/* if this was a daemon, report it */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* output a message indicating we failed to launch a daemon */
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
}
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s called abort with exit code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc), pptr->exit_code));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
|
||||
/* point to the first proc to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s exited with non-zero status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
pptr->exit_code));
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* track the number of non-zero exits */
|
||||
i32 = 0;
|
||||
i32ptr = &i32;
|
||||
orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
|
||||
++i32;
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
|
||||
if (orte_abort_non_zero_exit) {
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
} else {
|
||||
/* user requested we consider this normal termination */
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
/* this job has terminated */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s heartbeat failed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
|
||||
/* point to the first rank to cause the problem */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* kill the job */
|
||||
_terminate_job(jdata->jobid);
|
||||
}
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: unable to send message to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* shouldn't get this, but terminate job if required */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm: proc %s default error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state)));
|
||||
if (jdata->num_terminated == jdata->num_procs) {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* if the waitpid fired, be sure to let the state machine know */
|
||||
if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
/*
 * Predicted-fault entry point of this module.
 *
 * Not implemented here: all three list arguments are ignored and the
 * call always reports ORTE_ERR_NOT_IMPLEMENTED.
 */
static int predicted_fault(opal_list_t *proc_list,
                           opal_list_t *node_list,
                           opal_list_t *suggested_map)
{
    /* arguments are intentionally unused */
    (void)proc_list;
    (void)node_list;
    (void)suggested_map;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
|
||||
|
||||
static int suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list)
|
||||
{
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
39
orte/mca/errmgr/dvm/errmgr_dvm.h
Обычный файл
39
orte/mca/errmgr/dvm/errmgr_dvm.h
Обычный файл
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
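/**
 * @file
 *
 * Public declarations for the dvm errmgr component
 * (mca_errmgr_dvm_component) and its module (orte_errmgr_dvm_module).
 */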
|
||||
|
||||
#ifndef MCA_ERRMGR_dvm_EXPORT_H
|
||||
#define MCA_ERRMGR_dvm_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_dvm_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_dvm_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_dvm_EXPORT_H */
|
102
orte/mca/errmgr/dvm/errmgr_dvm_component.c
Обычный файл
102
orte/mca/errmgr/dvm/errmgr_dvm_component.c
Обычный файл
@ -0,0 +1,102 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_dvm.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_dvm_component_version_string =
|
||||
"ORTE ERRMGR dvm MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int dvm_register(void);
|
||||
static int dvm_open(void);
|
||||
static int dvm_close(void);
|
||||
static int dvm_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_dvm_component = {
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component dvm
|
||||
*/
|
||||
.base_version = {
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
.mca_component_name = "dvm",
|
||||
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION),
|
||||
|
||||
/* Component open and close functions */
|
||||
.mca_open_component = dvm_open,
|
||||
.mca_close_component = dvm_close,
|
||||
.mca_query_component = dvm_component_query,
|
||||
.mca_register_component_params = dvm_register,
|
||||
},
|
||||
.base_data = {
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
};
|
||||
|
||||
static int my_priority;
|
||||
|
||||
static int dvm_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_errmgr_dvm_component.base_version;
|
||||
|
||||
my_priority = 1000;
|
||||
(void) mca_base_component_var_register(c, "priority",
|
||||
"Priority of the dvm errmgr component",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &my_priority);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int dvm_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int dvm_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Component query function.
 *
 * The dvm module is used by DVM masters only. When ORTE_PROC_IS_MASTER
 * holds, the module and the registered priority are handed back and
 * ORTE_SUCCESS is returned; otherwise *module is set to NULL, *priority
 * to -1, and ORTE_ERROR is returned.
 *
 * @param module    [out] receives the module pointer, or NULL
 * @param priority  [out] receives the priority, or -1
 */
static int dvm_component_query(mca_base_module_t **module, int *priority)
{
    /* anything other than a DVM master declines selection */
    if (!ORTE_PROC_IS_MASTER) {
        *module = NULL;
        *priority = -1;
        return ORTE_ERROR;
    }

    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_errmgr_dvm_module;
    return ORTE_SUCCESS;
}
|
7
orte/mca/errmgr/dvm/owner.txt
Обычный файл
7
orte/mca/errmgr/dvm/owner.txt
Обычный файл
@ -0,0 +1,7 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: INTEL
|
||||
status: active
|
@ -299,6 +299,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
* the initial message, minus the headers inserted by xcast itself */
|
||||
relay = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.copy_payload(relay, buffer);
|
||||
/* setup the relay list */
|
||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||
|
||||
/* if this is headed for the daemon command processor,
|
||||
* then we first need to check for add_local_procs
|
||||
@ -308,14 +310,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS == (ret = opal_dss.unpack(buffer, &command, &cnt, ORTE_DAEMON_CMD))) {
|
||||
/* if it is add_procs, then... */
|
||||
if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
|
||||
OBJ_RELEASE(relay);
|
||||
relay = OBJ_NEW(opal_buffer_t);
|
||||
/* repack the command */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto relay;
|
||||
}
|
||||
if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
|
||||
ORTE_DAEMON_DVM_NIDMAP_CMD == command) {
|
||||
/* extract the byte object holding the daemonmap */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
|
||||
@ -354,11 +350,21 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto relay;
|
||||
}
|
||||
if (0 == flag) {
|
||||
/* copy the remainder of the payload */
|
||||
opal_dss.copy_payload(relay, buffer);
|
||||
/* no - just return */
|
||||
goto relay;
|
||||
|
||||
if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
|
||||
OBJ_RELEASE(relay);
|
||||
relay = OBJ_NEW(opal_buffer_t);
|
||||
/* repack the command */
|
||||
if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto relay;
|
||||
}
|
||||
if (0 == flag) {
|
||||
/* copy the remainder of the payload */
|
||||
opal_dss.copy_payload(relay, buffer);
|
||||
/* no - just return */
|
||||
goto relay;
|
||||
}
|
||||
}
|
||||
|
||||
/* unpack the byte object */
|
||||
@ -381,8 +387,10 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
OBJ_DESTRUCT(&wireup);
|
||||
}
|
||||
free(bo);
|
||||
/* copy the remainder of the payload */
|
||||
opal_dss.copy_payload(relay, buffer);
|
||||
if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
|
||||
/* copy the remainder of the payload */
|
||||
opal_dss.copy_payload(relay, buffer);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -391,8 +399,6 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
relay:
|
||||
/* setup the relay list */
|
||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||
|
||||
/* get the list of next recipients from the routed module */
|
||||
orte_routed.get_routing_list(&coll);
|
||||
@ -420,18 +426,14 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
*/
|
||||
jdata = orte_get_job_data_object(nm->name.jobid);
|
||||
if (NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
|
||||
"%s grpcomm:direct:send_relay proc %s not found - cannot relay",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&nm->name)));
|
||||
opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
||||
OBJ_RELEASE(rly);
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_RUNNING < rec->state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
|
||||
"%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&nm->name)));
|
||||
opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
|
||||
OBJ_RELEASE(rly);
|
||||
continue;
|
||||
}
|
||||
@ -449,10 +451,12 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
OBJ_DESTRUCT(&coll);
|
||||
|
||||
/* now send the relay buffer to myself for processing */
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, tag,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(relay);
|
||||
if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, tag,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(relay);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,7 @@
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -160,7 +160,6 @@ orte_odls_base_module_t orte_odls_alps_module = {
|
||||
orte_odls_alps_launch_local_procs,
|
||||
orte_odls_alps_kill_local_procs,
|
||||
orte_odls_alps_signal_local_procs,
|
||||
orte_odls_base_default_deliver_message,
|
||||
orte_odls_alps_restart_proc
|
||||
};
|
||||
|
||||
|
@ -122,6 +122,21 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if this is a DVM-based launch, then don't pack all the wireup
|
||||
* info as we don't need it - just pack the job itself */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
|
||||
numjobs = 0;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numjobs, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack the job struct */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata, 1, ORTE_JOB))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* construct a nodemap - only want updated items */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo, true))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -757,7 +772,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
}
|
||||
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
|
||||
j == (int)child->app_idx) {
|
||||
child->exit_code = rc;
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
}
|
||||
}
|
||||
@ -954,7 +969,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
orte_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:xterm-rank-out-of-bounds",
|
||||
true, nm->name.vpid, jobdat->num_procs);
|
||||
child->exit_code = ORTE_ERR_SILENT;
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
continue;
|
||||
}
|
||||
@ -981,7 +996,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
orte_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:fork-agent-not-found",
|
||||
true, orte_process_info.nodename, orte_fork_agent[0]);
|
||||
child->exit_code = ORTE_ERR_SILENT;
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
continue;
|
||||
}
|
||||
@ -1014,7 +1029,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
&(app->argv),
|
||||
&(app->env) ) ) ) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
|
||||
child->exit_code = rc;
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
|
||||
continue;
|
||||
}
|
||||
@ -1040,7 +1055,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app->env, jobdat))) {
|
||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||
child->exit_code = rc; /* error message already output */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
}
|
||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||
@ -1059,7 +1074,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
* across the entire cluster. Instead, we let orterun
|
||||
* output a consolidated error message for us
|
||||
*/
|
||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||
child->exit_code = rc; /* error message already output */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
continue;
|
||||
} else {
|
||||
@ -1104,57 +1119,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
/**
 * Deliver a copy of a message to every live local child of a job.
 *
 * Walks orte_local_children and, for each child that is flagged alive
 * and whose jobid matches @p job, sends a private copy of @p buffer on
 * @p tag using a non-blocking RML send.
 *
 * @param job     jobid to match; may be a WILDCARD value, which is why
 *                opal_dss.compare is used rather than ==
 * @param buffer  payload to copy for each recipient (not modified here)
 * @param tag     RML tag the message is sent on
 *
 * @return ORTE_SUCCESS, or the error code of the first send that failed
 *         for a reason other than ORTE_ERR_ADDRESSEE_UNKNOWN.
 */
int orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag)
{
    int rc;
    int i;
    orte_proc_t *child;
    opal_buffer_t *relay;

    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }

        /* do we have a child from the specified job. Because the
         * job could be given as a WILDCARD value, we must use
         * the dss.compare function to check for equality.
         */
        if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE) ||
            OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                             "%s odls: sending message to tag %lu on child %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (unsigned long)tag, ORTE_NAME_PRINT(&child->name)));

        /* if so, send the message - each recipient gets its own copy */
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(relay, buffer);
        rc = orte_rml.send_buffer_nb(&child->name, relay, tag, orte_rml_send_callback, NULL);
        if (rc < 0) {
            /* the send failed, so we still hold the copy and must release
             * it ourselves - this applies to the "addressee unknown" case
             * as well, which previously leaked the buffer
             */
            OBJ_RELEASE(relay);

            if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
                /* ignore if the addressee is unknown as a race condition could
                 * have allowed the child to exit before we send it a barrier
                 * due to the vagaries of the event library.
                 *
                 * If we do get an error it is likely that the orte_local_children
                 * has changed to reflect it, so we can no longer deliver messages.
                 * So just stop and return the error code.
                 */
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
/**
|
||||
* Pass a signal to my local procs
|
||||
*/
|
||||
@ -1412,23 +1376,6 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
orte_proc_t *child;
|
||||
orte_odls_base_kill_local_fn_t kill_local;
|
||||
} odls_kill_caddy_t;
|
||||
|
||||
static void kill_cbfunc(int fd, short args, void *cbdata)
|
||||
{
|
||||
odls_kill_caddy_t *cd = (odls_kill_caddy_t*)cbdata;
|
||||
|
||||
if (!ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_ALIVE) || 0 == cd->child->pid) {
|
||||
free(cd);
|
||||
return;
|
||||
}
|
||||
cd->kill_local(cd->child->pid, SIGKILL);
|
||||
free(cd);
|
||||
}
|
||||
|
||||
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
orte_odls_base_kill_local_fn_t kill_local,
|
||||
orte_odls_base_child_died_fn_t child_died)
|
||||
@ -1555,48 +1502,17 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
*/
|
||||
orte_wait_cb_cancel(child);
|
||||
|
||||
if (!do_cleanup) {
|
||||
odls_kill_caddy_t *cd;
|
||||
|
||||
/* if we are killing only selected procs, then do so in a gentle
|
||||
fashion. First send a SIGCONT in case the process is in stopped state.
|
||||
If it is in a stopped state and we do not first change it to
|
||||
running, then SIGTERM will not get delivered. Ignore return
|
||||
value. */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING SIGCONT TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGCONT);
|
||||
|
||||
/* Send a sigterm to the process before sigkill to be nice */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING SIGTERM TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGTERM);
|
||||
/* provide a polite delay so the proc has a chance to react */
|
||||
cd = (odls_kill_caddy_t*)malloc(sizeof(odls_kill_caddy_t));
|
||||
OBJ_RETAIN(child); // protect against race conditions
|
||||
cd->child = child;
|
||||
cd->kill_local = kill_local;
|
||||
ORTE_TIMER_EVENT(1, 0, kill_cbfunc, ORTE_SYS_PRI);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Force the SIGKILL just to make sure things are dead
|
||||
/* Use SIGKILL just to make sure things are dead
|
||||
* This fixes an issue that, if the application is masking
|
||||
* SIGTERM, then the child_died()
|
||||
* may return 'true' even though waipid returns with 0.
|
||||
* It does this to avoid a race condition, per documentation
|
||||
* in odls_default_module.c.
|
||||
* SIGTERM, then the child_died() may return 'true' even
|
||||
* though waipid returns with 0. It does this to avoid a
|
||||
* race condition, per documentation in odls_default_module.c.
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING FORCE SIGKILL TO %s pid %lu",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name), (unsigned long)child->pid));
|
||||
kill_local(child->pid, SIGKILL);
|
||||
|
||||
/* indicate the waitpid fired as this is effectively what
|
||||
* has happened
|
||||
*/
|
||||
|
@ -9,9 +9,10 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -103,9 +104,6 @@ OBJ_CLASS_DECLARATION(orte_odls_launch_local_t);
|
||||
|
||||
ORTE_DECLSPEC void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata);
|
||||
|
||||
ORTE_DECLSPEC int
|
||||
orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag);
|
||||
|
||||
ORTE_DECLSPEC void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata);
|
||||
|
||||
/* define a function type to signal a local proc */
|
||||
|
@ -15,7 +15,7 @@
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -160,7 +160,6 @@ orte_odls_base_module_t orte_odls_default_module = {
|
||||
orte_odls_default_launch_local_procs,
|
||||
orte_odls_default_kill_local_procs,
|
||||
orte_odls_default_signal_local_procs,
|
||||
orte_odls_base_default_deliver_message,
|
||||
orte_odls_default_restart_proc
|
||||
};
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -74,12 +75,6 @@ typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(opal_pointer_arra
|
||||
typedef int (*orte_odls_base_module_signal_local_process_fn_t)(const orte_process_name_t *proc,
|
||||
int32_t signal);
|
||||
|
||||
/**
|
||||
* Deliver a message to local processes
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_deliver_message_fn_t)(orte_jobid_t job, opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag);
|
||||
|
||||
/**
|
||||
* Restart a local process
|
||||
*/
|
||||
@ -93,7 +88,6 @@ struct orte_odls_base_module_1_3_0_t {
|
||||
orte_odls_base_module_launch_local_processes_fn_t launch_local_procs;
|
||||
orte_odls_base_module_kill_local_processes_fn_t kill_local_procs;
|
||||
orte_odls_base_module_signal_local_process_fn_t signal_local_procs;
|
||||
orte_odls_base_module_deliver_message_fn_t deliver_message;
|
||||
orte_odls_base_module_restart_proc_fn_t restart_proc;
|
||||
};
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -48,7 +48,6 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
#define ORTE_DAEMON_HEARTBEAT_CMD (orte_daemon_cmd_flag_t) 6
|
||||
#define ORTE_DAEMON_EXIT_CMD (orte_daemon_cmd_flag_t) 7
|
||||
#define ORTE_DAEMON_PROCESS_AND_RELAY_CMD (orte_daemon_cmd_flag_t) 9
|
||||
#define ORTE_DAEMON_MESSAGE_LOCAL_PROCS (orte_daemon_cmd_flag_t) 10
|
||||
#define ORTE_DAEMON_NULL_CMD (orte_daemon_cmd_flag_t) 11
|
||||
|
||||
/* commands for use by tools */
|
||||
@ -59,6 +58,8 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
#define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 18
|
||||
#define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 19
|
||||
#define ORTE_DAEMON_HALT_DVM_CMD (orte_daemon_cmd_flag_t) 20
|
||||
#define ORTE_DAEMON_REPORT_JOB_COMPLETE (orte_daemon_cmd_flag_t) 21
|
||||
|
||||
|
||||
/* request proc resource usage */
|
||||
#define ORTE_DAEMON_TOP_CMD (orte_daemon_cmd_flag_t) 22
|
||||
@ -74,9 +75,10 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
/* process called "errmgr.abort_procs" */
|
||||
#define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28
|
||||
|
||||
/* new daemon collective id */
|
||||
#define ORTE_DAEMON_NEW_COLL_ID (orte_daemon_cmd_flag_t) 29
|
||||
|
||||
/* nidmap for the DVM */
|
||||
#define ORTE_DAEMON_DVM_NIDMAP_CMD (orte_daemon_cmd_flag_t) 29
|
||||
/* add procs for the DVM */
|
||||
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30
|
||||
|
||||
/*
|
||||
* Struct written up the pipe from the child to the parent.
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2009 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -104,6 +104,9 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
||||
if (NULL == (node = dmn1->node) ||
|
||||
NULL == (t = node->topology)) {
|
||||
/* something is wrong */
|
||||
opal_output(0, "NODE IS %s T IS %s",
|
||||
(NULL == node) ? "NULL" : "NOT-NULL",
|
||||
(NULL == t) ? "NULL" : "NOT-NULL");
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERR_NOT_FOUND);
|
||||
OBJ_RELEASE(caddy);
|
||||
@ -512,8 +515,12 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
||||
/* setup the buffer */
|
||||
buffer = OBJ_NEW(opal_buffer_t);
|
||||
|
||||
/* pack the add_local_procs command */
|
||||
command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
/* pack the appropriate add_local_procs command */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
|
||||
command = ORTE_DAEMON_DVM_ADD_PROCS;
|
||||
} else {
|
||||
command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
@ -637,9 +644,10 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
|
||||
* it won't register and we need to send the response now.
|
||||
* Otherwise, it is an MPI job and we should wait for it
|
||||
* to register */
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_NON_ORTE_JOB, NULL, OPAL_BOOL)) {
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_NON_ORTE_JOB, NULL, OPAL_BOOL) &&
|
||||
!orte_get_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, NULL, OPAL_BOOL)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:launch job %s is not MPI",
|
||||
"%s plm:base:launch job %s is MPI",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
goto cleanup;
|
||||
@ -724,13 +732,16 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
||||
caddy->jdata->state = caddy->job_state;
|
||||
|
||||
/* if this isn't a dynamic spawn, just cleanup */
|
||||
if (ORTE_JOBID_INVALID == jdata->originator.jobid) {
|
||||
if (ORTE_JOBID_INVALID == jdata->originator.jobid ||
|
||||
orte_get_attribute(&jdata->attributes, ORTE_JOB_NON_ORTE_JOB, NULL, OPAL_BOOL) ||
|
||||
orte_get_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, NULL, OPAL_BOOL)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:launch job %s is not a dynamic spawn",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if it was a dynamic spawn, send the response */
|
||||
rc = ORTE_SUCCESS;
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
@ -1529,6 +1540,15 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
}
|
||||
map = daemons->map;
|
||||
|
||||
/* if this job is being launched against a fixed DVM, then there is
|
||||
* nothing for us to do - the DVM will stand as is */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, NULL, OPAL_BOOL)) {
|
||||
/* mark that the daemons have reported so we can proceed */
|
||||
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
||||
map->num_new_daemons = 0;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if this is a dynamic spawn, then we don't make any changes to
|
||||
* the virtual machine unless specifically requested to do so
|
||||
*/
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -151,7 +151,7 @@ BEGIN_C_DECLS
|
||||
|
||||
/* notifier support */
|
||||
#define ORTE_RML_TAG_NOTIFIER_HNP 52
|
||||
#define ORTE_RML_TAG_CONFIRM_SPAWN 53
|
||||
#define ORTE_RML_TAG_NOTIFY_COMPLETE 53
|
||||
|
||||
/*** QOS specific RML TAGS ***/
|
||||
#define ORTE_RML_TAG_OPEN_CHANNEL_REQ 54
|
||||
|
@ -757,10 +757,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
||||
* is maintained!
|
||||
*/
|
||||
if (1 < j) {
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
/* this was a debugger daemon. notify that a debugger has detached */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
|
||||
}
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
/* this was a debugger daemon. notify that a debugger has detached */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
|
||||
}
|
||||
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
|
||||
OBJ_RELEASE(jdata);
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -20,11 +20,14 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
|
||||
@ -40,8 +43,10 @@ static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
/* local functions */
|
||||
static void init_complete(int fd, short args, void *cbdata);
|
||||
static void vm_ready(int fd, short args, void *cbata);
|
||||
void check_complete(int fd, short args, void *cbdata);
|
||||
static void check_complete(int fd, short args, void *cbdata);
|
||||
static void cleanup_job(int fd, short args, void *cbdata);
|
||||
|
||||
/******************
|
||||
* DVM module - used when mpirun is persistent
|
||||
@ -86,7 +91,7 @@ static orte_job_state_t launch_states[] = {
|
||||
};
|
||||
static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_plm_base_setup_job,
|
||||
orte_plm_base_setup_job_complete,
|
||||
init_complete,
|
||||
orte_ras_base_allocate,
|
||||
orte_plm_base_allocation_complete,
|
||||
orte_plm_base_daemons_launched,
|
||||
@ -100,7 +105,7 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_plm_base_post_launch,
|
||||
orte_plm_base_registered,
|
||||
check_complete,
|
||||
orte_state_base_cleanup_job,
|
||||
cleanup_job,
|
||||
orte_quit
|
||||
};
|
||||
|
||||
@ -210,12 +215,105 @@ static void files_ready(int status, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
static void vm_ready(int fd, short args, void *cbdata)
|
||||
/*
 * State-machine callback run once job initialization has finished.
 *
 * cbdata is the orte_state_caddy_t for the job being advanced.
 * If the job is our own (its jobid equals ORTE_PROC_MY_NAME->jobid),
 * the job is moved to ORTE_JOB_STATE_ALLOCATE. For any other job,
 * file prepositioning is requested from orte_filem with files_ready
 * as the completion callback; if that request fails, termination is
 * forced. The caddy is released on every path.
 *
 * sd and args are not used in this function.
 */
static void init_complete(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
/* nothing to do here but move along - if it is the
|
||||
* daemon job, then next step is allocate */
|
||||
if (caddy->jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE);
|
||||
} else {
|
||||
/* next step - position any required files */
/* the job object itself is handed to files_ready as its cbdata */
|
||||
if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
}
|
||||
/* only the caddy wrapper is released here; jdata is not released in this function */
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void vm_ready(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
int rc;
|
||||
opal_buffer_t *buf;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_DVM_NIDMAP_CMD;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
opal_buffer_t *wireup;
|
||||
opal_byte_object_t bo, *boptr;
|
||||
int8_t flag;
|
||||
int32_t numbytes;
|
||||
|
||||
/* if this is my job, then we are done */
|
||||
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
|
||||
/* send the daemon map to every daemon in this DVM - we
|
||||
* do this here so we don't have to do it for every
|
||||
* job we are going to launch */
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
/* pack the "load nidmap" cmd */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
/* construct a nodemap with everything in it */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
/* store it */
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
|
||||
/* pack a flag indicating wiring info is provided */
|
||||
flag = 1;
|
||||
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
|
||||
/* get wireup info for daemons per the selected routing module */
|
||||
wireup = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
/* put it in a byte object for xmission */
|
||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
||||
/* pack the byte object - zero-byte objects are fine */
|
||||
bo.size = numbytes;
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
OBJ_RELEASE(buf);
|
||||
return;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
if (NULL != bo.bytes) {
|
||||
free(bo.bytes);
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
|
||||
/* goes to all daemons */
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buf);
|
||||
OBJ_RELEASE(sig);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(buf);
|
||||
/* notify that the vm is ready */
|
||||
fprintf(stdout, "DVM ready\n");
|
||||
OBJ_RELEASE(caddy);
|
||||
@ -234,92 +332,27 @@ static void vm_ready(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
void check_complete(int fd, short args, void *cbdata)
|
||||
static void check_complete(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
orte_std_cntr_t j;
|
||||
orte_job_t *job;
|
||||
orte_node_t *node;
|
||||
orte_job_map_t *map;
|
||||
orte_std_cntr_t index;
|
||||
bool one_still_alive;
|
||||
orte_vpid_t lowest=0;
|
||||
int32_t i32, *i32ptr;
|
||||
|
||||
opal_output_verbose(2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_complete on job %s",
|
||||
"%s state:dvm:check_job_complete on job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));
|
||||
|
||||
if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* just check to see if the daemons are complete */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_complete - received NULL job, checking daemons",
|
||||
"%s state:dvm:check_job_complete - received NULL job, checking daemons",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
goto CHECK_DAEMONS;
|
||||
} else {
|
||||
/* mark the job as terminated, but don't override any
|
||||
* abnormal termination flags
|
||||
*/
|
||||
if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
}
|
||||
|
||||
/* tell the IOF that the job is complete */
|
||||
if (NULL != orte_iof.complete) {
|
||||
orte_iof.complete(jdata);
|
||||
}
|
||||
|
||||
i32ptr = &i32;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) && !orte_abort_non_zero_exit) {
|
||||
if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
|
||||
/* update the exit code */
|
||||
ORTE_UPDATE_EXIT_STATUS(lowest);
|
||||
}
|
||||
|
||||
/* warn user */
|
||||
opal_output(orte_clean_output,
|
||||
"-------------------------------------------------------\n"
|
||||
"While %s job %s terminated normally, %d %s. Further examination may be required.\n"
|
||||
"-------------------------------------------------------",
|
||||
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
|
||||
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
|
||||
i32, (1 == i32) ? "process returned\na non-zero exit code." :
|
||||
"processes returned\nnon-zero exit codes.");
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
orte_job_state_to_str(jdata->state)));
|
||||
|
||||
/* if this job is a continuously operating one, then don't do
|
||||
* anything further - just return here
|
||||
*/
|
||||
if (NULL != jdata &&
|
||||
(orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
|
||||
ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
|
||||
goto CHECK_ALIVE;
|
||||
}
|
||||
|
||||
/* if the job that is being checked is the HNP, then we are
|
||||
* trying to terminate the orteds. In that situation, we
|
||||
* do -not- check all jobs - we simply notify the HNP
|
||||
* that the orteds are complete. Also check special case
|
||||
* if jdata is NULL - we want
|
||||
* to definitely declare the job done if the orteds
|
||||
* have completed, no matter what else may be happening.
|
||||
* This can happen if a ctrl-c hits in the "wrong" place
|
||||
* while launching
|
||||
*/
|
||||
CHECK_DAEMONS:
|
||||
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
/* orteds are done! */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
@ -336,6 +369,18 @@ void check_complete(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
/* mark the job as terminated, but don't override any
|
||||
* abnormal termination flags
|
||||
*/
|
||||
if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
|
||||
jdata->state = ORTE_JOB_STATE_TERMINATED;
|
||||
}
|
||||
|
||||
/* tell the IOF that the job is complete */
|
||||
if (NULL != orte_iof.complete) {
|
||||
orte_iof.complete(jdata);
|
||||
}
|
||||
|
||||
/* Release the resources used by this job. Since some errmgrs may want
|
||||
* to continue using resources allocated to the job as part of their
|
||||
* fault recovery procedure, we only do this once the job is "complete".
|
||||
@ -388,114 +433,25 @@ void check_complete(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_ALIVE:
|
||||
/* now check to see if all jobs are done - trigger notification of this jdata
|
||||
* object when we find it
|
||||
*/
|
||||
one_still_alive = false;
|
||||
for (j=1; j < orte_job_data->size; j++) {
|
||||
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
|
||||
/* since we are releasing jdata objects as we
|
||||
* go, we can no longer assume that the job_data
|
||||
* array is left justified
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
/* if this is the job we are checking AND it normally terminated,
|
||||
* then activate the "notify_completed" state - this will release
|
||||
* the job state, but is provided so that the HNP main code can
|
||||
* take alternative actions if desired. If the state is killed_by_cmd,
|
||||
* then go ahead and release it. We cannot release it if it
|
||||
* abnormally terminated as mpirun needs the info so it can
|
||||
* report appropriately to the user
|
||||
*
|
||||
* NOTE: do not release the primary job (j=1) so we
|
||||
* can pretty-print completion message
|
||||
*/
|
||||
if (NULL != jdata && job->jobid == jdata->jobid) {
|
||||
if (jdata->state == ORTE_JOB_STATE_TERMINATED) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed state is terminated - activating notify",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
|
||||
one_still_alive = true;
|
||||
} else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ||
|
||||
jdata->state == ORTE_JOB_STATE_NOTIFIED) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed state is killed or notified - cleaning up",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* release this object, ensuring that the
|
||||
* pointer array internal accounting
|
||||
* is maintained!
|
||||
*/
|
||||
if (1 < j) {
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
/* this was a debugger daemon. notify that a debugger has detached */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
|
||||
}
|
||||
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
|
||||
OBJ_RELEASE(jdata);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/* if the job is flagged to not be monitored, skip it */
|
||||
if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
|
||||
continue;
|
||||
}
|
||||
/* when checking for job termination, we must be sure to NOT check
|
||||
* our own job as it - rather obviously - has NOT terminated!
|
||||
*/
|
||||
if (job->num_terminated < job->num_procs) {
|
||||
/* we have at least one job that is not done yet - we cannot
|
||||
* just return, though, as we need to ensure we cleanout the
|
||||
* job data for the job that just completed
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed job %s is not terminated (%d:%d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job->jobid),
|
||||
job->num_terminated, job->num_procs));
|
||||
one_still_alive = true;
|
||||
}
|
||||
else {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job->jobid),
|
||||
job->num_terminated, job->num_procs,
|
||||
(NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) ));
|
||||
}
|
||||
}
|
||||
/* if a job is still alive, we just return */
|
||||
if (one_still_alive) {
|
||||
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
/* this was a debugger daemon. notify that a debugger has detached */
|
||||
OBJ_RETAIN(jdata);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
|
||||
} else if (jdata->state != ORTE_JOB_STATE_NOTIFIED) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed at least one job is not terminated",
|
||||
"%s state:dvm:check_job_completed state is terminated - activating notify",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
OBJ_RETAIN(jdata);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
|
||||
/* mark the job as notified */
|
||||
jdata->state = ORTE_JOB_STATE_NOTIFIED;
|
||||
}
|
||||
/* if we get here, then all jobs are done, so terminate */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
|
||||
"%s state:base:check_job_completed all jobs terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* stop the job timeout event, if set */
|
||||
if (NULL != orte_mpiexec_timeout) {
|
||||
OBJ_RELEASE(orte_mpiexec_timeout);
|
||||
orte_mpiexec_timeout = NULL;
|
||||
}
|
||||
|
||||
/* set the exit status to 0 - this will only happen if it
|
||||
* wasn't already set by an error condition
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
|
||||
/* order daemon termination - this tells us to cleanup
|
||||
* our local procs as well as telling remote daemons
|
||||
* to die
|
||||
*/
|
||||
orte_plm.terminate_orteds();
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
/*
 * State-machine callback for the job cleanup step in the DVM module.
 *
 * The only action taken is releasing the orte_state_caddy_t passed in
 * cbdata; the job object it refers to is not touched here.
 * sd and args are not used in this function.
 */
static void cleanup_job(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
@ -10,7 +10,8 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -23,10 +24,12 @@
|
||||
dist_ortedata_DATA += orted/help-orted.txt
|
||||
|
||||
headers += \
|
||||
orted/orted.h
|
||||
orted/orted.h \
|
||||
orted/orted_submit.h
|
||||
|
||||
lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \
|
||||
orted/orted_main.c \
|
||||
orted/orted_comm.c
|
||||
orted/orted_comm.c \
|
||||
orted/orted_submit.c
|
||||
|
||||
include orted/pmix/Makefile.am
|
||||
|
@ -14,7 +14,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -95,10 +95,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
orte_std_cntr_t n;
|
||||
int32_t signal;
|
||||
orte_jobid_t job;
|
||||
orte_rml_tag_t target_tag;
|
||||
char *contact_info;
|
||||
opal_buffer_t *answer;
|
||||
orte_rml_cmd_flag_t rml_cmd;
|
||||
orte_job_t *jdata;
|
||||
orte_process_name_t proc, proc2;
|
||||
orte_process_name_t *return_addr;
|
||||
@ -228,6 +226,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
|
||||
/**** ADD_LOCAL_PROCS ****/
|
||||
case ORTE_DAEMON_ADD_LOCAL_PROCS:
|
||||
case ORTE_DAEMON_DVM_ADD_PROCS:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received add_local_procs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
@ -340,87 +339,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
break;
|
||||
|
||||
/**** DELIVER A MESSAGE TO THE LOCAL PROCS ****/
|
||||
case ORTE_DAEMON_MESSAGE_LOCAL_PROCS:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received message_local_procs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
|
||||
/* unpack the jobid of the procs that are to receive the message */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* unpack the tag where we are to deliver the message */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &target_tag, &n, ORTE_RML_TAG))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
|
||||
"%s orted:comm:message_local_procs delivering message to job %s tag %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), (int)target_tag));
|
||||
|
||||
relay_msg = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.copy_payload(relay_msg, buffer);
|
||||
|
||||
/* if job=my_jobid, then this message is for us and not for our children */
|
||||
if (ORTE_PROC_MY_NAME->jobid == job) {
|
||||
/* if the target tag is our xcast_barrier or rml_update, then we have
|
||||
* to handle the message as a special case. The RML has logic in it
|
||||
* intended to make it easier to use. This special logic mandates that
|
||||
* any message we "send" actually only goes into the queue for later
|
||||
* transmission. Thus, since we are already in a recv when we enter
|
||||
* the "process_commands" function, any attempt to "send" the relay
|
||||
* buffer to ourselves will only be added to the queue - it won't
|
||||
* actually be delivered until *after* we conclude the processing
|
||||
* of the current recv.
|
||||
*
|
||||
* The problem here is that, for messages where we need to relay
|
||||
* them along the orted chain, the rml_update
|
||||
* message contains contact info we may well need in order to do
|
||||
* the relay! So we need to process those messages immediately.
|
||||
* The only way to accomplish that is to (a) detect that the
|
||||
* buffer is intended for those tags, and then (b) process
|
||||
* those buffers here.
|
||||
*
|
||||
*/
|
||||
if (ORTE_RML_TAG_RML_INFO_UPDATE == target_tag) {
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(relay_msg, &rml_cmd, &n, ORTE_RML_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* initialize the routes to my peers - this will update the number
|
||||
* of daemons in the system (i.e., orte_process_info.num_procs) as
|
||||
* this might have changed
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, relay_msg))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
} else {
|
||||
/* just deliver it to ourselves */
|
||||
if ((ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay_msg, target_tag,
|
||||
orte_rml_send_callback, NULL)) < 0) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(relay_msg);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* must be for our children - deliver the message */
|
||||
if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay_msg, target_tag))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
OBJ_RELEASE(relay_msg);
|
||||
}
|
||||
break;
|
||||
|
||||
/**** EXIT COMMAND ****/
|
||||
case ORTE_DAEMON_EXIT_CMD:
|
||||
if (orte_debug_daemons_flag) {
|
||||
@ -518,22 +436,22 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
opal_output(0, "%s orted_cmd: received spawn job",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
job = ORTE_JOBID_INVALID;
|
||||
/* can only process this if we are the HNP */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* unpack the job data */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &jdata, &n, ORTE_JOB))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto ANSWER_LAUNCH;
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
|
||||
break;
|
||||
}
|
||||
/* point the originator to the sender */
|
||||
jdata->originator = *sender;
|
||||
/* assign a jobid to it */
|
||||
if (ORTE_SUCCESS != (ret = orte_plm_base_create_jobid(jdata))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto ANSWER_LAUNCH;
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
|
||||
break;
|
||||
}
|
||||
/* store it on the global job data pool */
|
||||
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);
|
||||
@ -550,7 +468,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, &ioftag, 1, ORTE_IOF_TAG))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(iofbuf);
|
||||
goto ANSWER_LAUNCH;
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
|
||||
break;
|
||||
}
|
||||
/* pack the name of the source */
|
||||
source.jobid = jdata->jobid;
|
||||
@ -558,13 +477,15 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, &source, 1, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(iofbuf);
|
||||
goto ANSWER_LAUNCH;
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
|
||||
break;
|
||||
}
|
||||
/* pack the sender as the sink */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, sender, 1, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(iofbuf);
|
||||
goto ANSWER_LAUNCH;
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
|
||||
break;
|
||||
}
|
||||
/* send the buffer to our IOF */
|
||||
orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, iofbuf, ORTE_RML_TAG_IOF_HNP,
|
||||
@ -578,22 +499,39 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
/* now launch the job - this will just push it into our state machine */
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto ANSWER_LAUNCH;
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
|
||||
break;
|
||||
}
|
||||
job = jdata->jobid;
|
||||
}
|
||||
ANSWER_LAUNCH:
|
||||
/* pack the jobid to be returned */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &job, 1, ORTE_JOBID))) {
|
||||
break;
|
||||
|
||||
|
||||
/**** TERMINATE JOB COMMAND ****/
|
||||
case ORTE_DAEMON_TERMINATE_JOB_CMD:
|
||||
|
||||
/* unpack the jobid */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* return response */
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_CONFIRM_SPAWN,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
|
||||
/* look up job data object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* mark the job as (being) cancelled so that we can distinguish it later */
|
||||
if (ORTE_SUCCESS != (ret = orte_set_attribute(&jdata->attributes, ORTE_JOB_CANCELLED,
|
||||
ORTE_ATTR_LOCAL, NULL, OPAL_BOOL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.terminate_job(job))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -1165,8 +1103,6 @@ static char *get_orted_comm_cmd_str(int command)
|
||||
return strdup("ORTE_DAEMON_EXIT_CMD");
|
||||
case ORTE_DAEMON_PROCESS_AND_RELAY_CMD:
|
||||
return strdup("ORTE_DAEMON_PROCESS_AND_RELAY_CMD");
|
||||
case ORTE_DAEMON_MESSAGE_LOCAL_PROCS:
|
||||
return strdup("ORTE_DAEMON_MESSAGE_LOCAL_PROCS");
|
||||
case ORTE_DAEMON_NULL_CMD:
|
||||
return strdup("NULL");
|
||||
|
||||
@ -1185,6 +1121,9 @@ static char *get_orted_comm_cmd_str(int command)
|
||||
return strdup("ORTE_DAEMON_HALT_VM_CMD");
|
||||
case ORTE_DAEMON_HALT_DVM_CMD:
|
||||
return strdup("ORTE_DAEMON_HALT_DVM_CMD");
|
||||
case ORTE_DAEMON_REPORT_JOB_COMPLETE:
|
||||
return strdup("ORTE_DAEMON_REPORT_JOB_COMPLETE");
|
||||
|
||||
case ORTE_DAEMON_TOP_CMD:
|
||||
return strdup("ORTE_DAEMON_TOP_CMD");
|
||||
case ORTE_DAEMON_NAME_REQ_CMD:
|
||||
@ -1198,8 +1137,11 @@ static char *get_orted_comm_cmd_str(int command)
|
||||
return strdup("ORTE_DAEMON_PROCESS_CMD");
|
||||
case ORTE_DAEMON_ABORT_PROCS_CALLED:
|
||||
return strdup("ORTE_DAEMON_ABORT_PROCS_CALLED");
|
||||
case ORTE_DAEMON_NEW_COLL_ID:
|
||||
return strdup("ORTE_DAEMON_NEW_COLL_ID");
|
||||
|
||||
case ORTE_DAEMON_DVM_NIDMAP_CMD:
|
||||
return strdup("ORTE_DAEMON_DVM_NIDMAP_CMD");
|
||||
case ORTE_DAEMON_DVM_ADD_PROCS:
|
||||
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");
|
||||
|
||||
default:
|
||||
return strdup("Unknown Command!");
|
||||
|
1808
orte/orted/orted_submit.c
Обычный файл
1808
orte/orted/orted_submit.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
35
orte/orted/orted_submit.h
Обычный файл
35
orte/orted/orted_submit.h
Обычный файл
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTED_SUBMIT_H
|
||||
#define ORTED_SUBMIT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
|
||||
/* Callback type accepted by orte_submit_job() for both launch_cb and
 * complete_cb. It receives an int index, the job object, an int status
 * and the caller-supplied cbdata pointer.
 * NOTE(review): the exact meaning of index/ret is defined in
 * orted_submit.c, which is not visible here - confirm. */
typedef void (*orte_submit_cbfunc_t)(int index, orte_job_t *jdata, int ret, void *cbdata);
|
||||
|
||||
/* Takes the process argc/argv plus a command-line option object.
 * NOTE(review): presumably must be called before the other
 * orte_submit_* functions - confirm against orted_submit.c. */
ORTE_DECLSPEC int orte_submit_init(int argc, char *argv[],
|
||||
opal_cmd_line_t *opts);
|
||||
/* NOTE(review): index is presumably the value returned through
 * orte_submit_job()'s index argument - confirm. */
ORTE_DECLSPEC int orte_submit_cancel(int index);
|
||||
ORTE_DECLSPEC void orte_submit_finalize(void);
|
||||
/* Takes a command vector, an int out-parameter (index) and two
 * callback/cbdata pairs of type orte_submit_cbfunc_t.
 * NOTE(review): launch_cb / complete_cb are presumably invoked on job
 * launch and job completion respectively - confirm. */
ORTE_DECLSPEC int orte_submit_job(char *cmd[], int *index,
|
||||
orte_submit_cbfunc_t launch_cb, void *launch_cbdata,
|
||||
orte_submit_cbfunc_t complete_cb, void *complete_cbdata);
|
||||
ORTE_DECLSPEC int orte_submit_halt(void);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTED_SUBMIT_H */
|
@ -212,6 +212,11 @@ bool orte_in_parallel_debugger = false;
|
||||
|
||||
char *orte_daemon_cores = NULL;
|
||||
|
||||
/**
|
||||
* Global struct for catching orte command line options.
|
||||
*/
|
||||
orte_cmd_line_t orte_cmd_line = {0};
|
||||
|
||||
int orte_dt_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
@ -412,6 +412,42 @@ typedef struct {
|
||||
} orte_topology_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_topology_t);
|
||||
|
||||
/**
|
||||
* Global struct for catching orte command line options.
|
||||
*/
|
||||
/* Destination storage for parsed command line options. orterun's
 * cmd_line_init table stores the address of most of these fields next
 * to the option that fills them; the option names noted below are taken
 * from that table. Fields without a note have no binding visible in the
 * code reviewed here. */
struct orte_cmd_line_t {
|
||||
bool help;   /* -h / --help */
|
||||
bool version;   /* -V / --version */
|
||||
bool verbose;   /* -v / --verbose */
|
||||
char *report_pid;   /* --report-pid: "-" = stdout, "+" = stderr, anything else = file */
|
||||
char *report_uri;   /* --report-uri: "-" = stdout, "+" = stderr, anything else = file */
|
||||
bool exit;
|
||||
bool debugger;   /* --tv (deprecated synonym) / --debug */
|
||||
int num_procs;   /* -c, -np / --np, -n / --n */
|
||||
char *env_val;
|
||||
char *appfile;   /* --app */
|
||||
char *wdir;   /* --wdir, --wd */
|
||||
bool set_cwd_to_session_dir;   /* --set-cwd-to-session-dir */
|
||||
char *path;   /* --path */
|
||||
char *preload_files;   /* --preload-files (comma separated list) */
|
||||
bool sleep;
|
||||
char *stdin_target;   /* --stdin: rank, all or none */
|
||||
char *prefix;   /* not bound in the table: orterun() resets it to NULL and handles "prefix" manually */
|
||||
char *path_to_mpirun;   /* reset to NULL by orterun() alongside prefix */
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
char *sstore_load;   /* --sstore-load (internal use only) */
|
||||
#endif
|
||||
bool disable_recovery;   /* --disable-recovery */
|
||||
bool preload_binaries;   /* -s / --preload-binary */
|
||||
bool index_argv;   /* --index-argv-by-rank */
|
||||
bool run_as_root;   /* --allow-run-as-root */
|
||||
char *personality;   /* --personality */
|
||||
bool create_dvm;   /* --dvm */
|
||||
bool terminate_dvm;
|
||||
};
|
||||
typedef struct orte_cmd_line_t orte_cmd_line_t;
|
||||
ORTE_DECLSPEC extern orte_cmd_line_t orte_cmd_line;
|
||||
|
||||
/**
|
||||
* Get a job data object
|
||||
* We cannot just reference a job data object with its jobid as
|
||||
|
@ -139,6 +139,193 @@ void orte_quit(int fd, short args, void *cbdata)
|
||||
opal_event_base_loopbreak(orte_event_base);
|
||||
}
|
||||
|
||||
int orte_print_aborted_job(orte_job_t *job,
|
||||
orte_app_context_t *approc,
|
||||
orte_proc_t *proc,
|
||||
orte_node_t *node)
|
||||
{
|
||||
if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
|
||||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
|
||||
switch (proc->exit_code) {
|
||||
case ORTE_ERR_SILENT:
|
||||
/* say nothing - it was already reported */
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_PIPES:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
||||
orte_basename, node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
||||
orte_basename, node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
||||
orte_basename, node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
||||
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
||||
orte_basename, node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_WDIR_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
||||
orte_basename, approc->cwd,
|
||||
node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
||||
orte_basename,
|
||||
(unsigned long)proc->name.vpid,
|
||||
orte_basename,
|
||||
orte_basename,
|
||||
node->name,
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
||||
orte_basename, approc->app, node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:multiple-paffinity-schemes", true, NULL);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
||||
NULL, approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_NODE_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-node-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_LOCAL_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-local-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_CORES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-core", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-core", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_PHYS_CPU:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-phys-cpu", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-socket", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-socket", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_MODULE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:paffinity-missing-module",
|
||||
true, node->name);
|
||||
break;
|
||||
case ORTE_ERR_SLOT_LIST_RANGE:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-slot-list-range",
|
||||
true, node->name, NULL);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_READ_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
||||
orte_basename, node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (0 != proc->exit_code) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||
orte_basename, node->name);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orte_basename);
|
||||
return ORTE_SUCCESS;
|
||||
} else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code),
|
||||
strsignal(WTERMSIG(proc->exit_code)));
|
||||
} else {
|
||||
#endif
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code));
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
}
|
||||
#endif
|
||||
return ORTE_SUCCESS;
|
||||
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orte_basename, orte_basename);
|
||||
return ORTE_SUCCESS;
|
||||
} else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
return ORTE_SUCCESS;
|
||||
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
||||
switch (proc->exit_code) {
|
||||
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
break;
|
||||
case ORTE_ERR_PROC_STALLED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
||||
break;
|
||||
|
||||
default:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
return ORTE_SUCCESS;
|
||||
} else if (orte_abort_non_zero_exit &&
|
||||
ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* nothing here */
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/*
|
||||
* On abnormal termination - dump the
|
||||
* exit status of the aborted procs.
|
||||
@ -146,10 +333,11 @@ void orte_quit(int fd, short args, void *cbdata)
|
||||
|
||||
static void dump_aborted_procs(void)
|
||||
{
|
||||
orte_std_cntr_t i, n;
|
||||
orte_std_cntr_t n;
|
||||
orte_job_t *job;
|
||||
orte_std_cntr_t i;
|
||||
orte_proc_t *proc, *pptr;
|
||||
orte_app_context_t *approc;
|
||||
orte_job_t *job;
|
||||
orte_node_t *node;
|
||||
|
||||
/* find the job that caused the problem - be sure to start the loop
|
||||
@ -161,6 +349,7 @@ static void dump_aborted_procs(void)
|
||||
/* the array is no longer left-justified, so we have to continue */
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ORTE_JOB_STATE_UNDEF != job->state &&
|
||||
ORTE_JOB_STATE_INIT != job->state &&
|
||||
ORTE_JOB_STATE_RUNNING != job->state &&
|
||||
@ -171,7 +360,7 @@ static void dump_aborted_procs(void)
|
||||
for (i=0; i < job->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
|
||||
/* array is left-justfied - we are done */
|
||||
continue;
|
||||
break;
|
||||
}
|
||||
if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
|
||||
ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
|
||||
@ -185,7 +374,7 @@ static void dump_aborted_procs(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* this is a guilty party */
|
||||
/* see if there is a guilty party */
|
||||
proc = NULL;
|
||||
if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
|
||||
NULL == proc) {
|
||||
@ -194,178 +383,9 @@ static void dump_aborted_procs(void)
|
||||
|
||||
approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
|
||||
node = proc->node;
|
||||
if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
|
||||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
|
||||
switch (proc->exit_code) {
|
||||
case ORTE_ERR_SILENT:
|
||||
/* say nothing - it was already reported */
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_PIPES:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_SETUP_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SYS_LIMITS_CHILDREN:
|
||||
orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_FAILED_GET_TERM_ATTRS:
|
||||
orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
|
||||
orte_basename, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_WDIR_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
|
||||
orte_basename, approc->cwd,
|
||||
proc->node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
|
||||
orte_basename,
|
||||
(unsigned long)proc->name.vpid,
|
||||
orte_basename,
|
||||
orte_basename,
|
||||
proc->node->name,
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_EXE_NOT_ACCESSIBLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
|
||||
orte_basename, approc->app, proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_MULTIPLE_AFFINITIES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:multiple-paffinity-schemes", true, NULL);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
|
||||
NULL, approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_NODE_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-node-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_LOCAL_RANK:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-local-rank", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_CORES:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-core", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-core", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_INVALID_PHYS_CPU:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-phys-cpu", true);
|
||||
break;
|
||||
case ORTE_ERR_NOT_ENOUGH_SOCKETS:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:not-enough-resources", true,
|
||||
"sockets", node->name,
|
||||
"bind-to-socket", approc->app);
|
||||
break;
|
||||
case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:topo-not-supported",
|
||||
true, node->name, "bind-to-socket", "",
|
||||
approc->app);
|
||||
break;
|
||||
case ORTE_ERR_MODULE_NOT_FOUND:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:paffinity-missing-module",
|
||||
true, node->name);
|
||||
break;
|
||||
case ORTE_ERR_SLOT_LIST_RANGE:
|
||||
orte_show_help("help-orterun.txt",
|
||||
"orterun:invalid-slot-list-range",
|
||||
true, node->name, NULL);
|
||||
break;
|
||||
case ORTE_ERR_PIPE_READ_FAILURE:
|
||||
orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
|
||||
orte_basename, node->name, (unsigned long)proc->name.vpid);
|
||||
break;
|
||||
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (0 != proc->exit_code) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||
orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||
orte_basename, node->name);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_ABORTED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orte_basename);
|
||||
} else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) { /* aborted by signal */
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code),
|
||||
strsignal(WTERMSIG(proc->exit_code)));
|
||||
} else {
|
||||
#endif
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, WTERMSIG(proc->exit_code));
|
||||
#ifdef HAVE_STRSIGNAL
|
||||
}
|
||||
#endif
|
||||
} else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
|
||||
orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
|
||||
node->name, orte_basename, orte_basename);
|
||||
} else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
} else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
|
||||
switch (proc->exit_code) {
|
||||
case ORTE_ERR_MEM_LIMIT_EXCEEDED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
break;
|
||||
case ORTE_ERR_PROC_STALLED:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
|
||||
break;
|
||||
|
||||
default:
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
|
||||
break;
|
||||
}
|
||||
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
} else if (orte_abort_non_zero_exit &&
|
||||
ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
|
||||
if (ORTE_SUCCESS == orte_print_aborted_job(job, approc, proc, node)) {
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -20,10 +21,17 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_DECLSPEC void orte_quit(int fd, short args, void *cbdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_print_aborted_job(orte_job_t *job,
|
||||
orte_app_context_t *approc,
|
||||
orte_proc_t *proc,
|
||||
orte_node_t *node);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_CR_H */
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -74,6 +74,7 @@
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
@ -129,6 +130,31 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&myglobals.run_as_root, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Allow execution as root (STRONGLY DISCOURAGED)" },
|
||||
|
||||
/* Specify the launch agent to be used */
|
||||
{ "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Command used to start processes on remote nodes (default: orted)" },
|
||||
|
||||
/* maximum size of VM - typically used to subdivide an allocation */
|
||||
{ "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Maximum size of VM" },
|
||||
|
||||
/* Set a hostfile */
|
||||
{ NULL, '\0', "hostfile", "hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
{ NULL, '\0', "machinefile", "machinefile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile" },
|
||||
{ "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a default hostfile" },
|
||||
|
||||
{ NULL, 'H', "host", "host", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -454,24 +480,48 @@ static void send_callback(int status, orte_process_name_t *peer,
|
||||
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
|
||||
OBJ_RELEASE(jdata);
|
||||
}
|
||||
|
||||
static void notify_requestor(int sd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
orte_proc_t *pptr;
|
||||
int ret;
|
||||
int ret, id, *idptr;
|
||||
opal_buffer_t *reply;
|
||||
|
||||
/* notify the requestor */
|
||||
reply = OBJ_NEW(opal_buffer_t);
|
||||
|
||||
/* see if there was any problem */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&pptr, OPAL_PTR) && NULL != pptr) {
|
||||
ret = pptr->exit_code;
|
||||
/* or whether we got cancelled by the user */
|
||||
} else if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CANCELLED, NULL, OPAL_BOOL)) {
|
||||
ret = ORTE_ERR_JOB_CANCELLED;
|
||||
} else {
|
||||
ret = 0;
|
||||
}
|
||||
/* return the completion status */
|
||||
opal_dss.pack(reply, &ret, 1, OPAL_INT);
|
||||
orte_rml.send_buffer_nb(&jdata->originator, reply, ORTE_RML_TAG_TOOL, send_callback, jdata);
|
||||
|
||||
/* pack the jobid to be returned */
|
||||
opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID);
|
||||
|
||||
/* return the tracker ID */
|
||||
idptr = &id;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&idptr, OPAL_INT)) {
|
||||
/* pack the sender's index to the tracking object */
|
||||
opal_dss.pack(reply, idptr, 1, OPAL_INT);
|
||||
}
|
||||
|
||||
/* if there was a problem, we need to send the requestor more info about what happened */
|
||||
if (0 < ret) {
|
||||
opal_dss.pack(reply, &jdata->state, 1, ORTE_JOB_STATE_T);
|
||||
opal_dss.pack(reply, &pptr, 1, ORTE_PROC);
|
||||
opal_dss.pack(reply, &pptr->node, 1, ORTE_NODE);
|
||||
}
|
||||
|
||||
orte_rml.send_buffer_nb(&jdata->originator, reply, ORTE_RML_TAG_NOTIFY_COMPLETE, send_callback, jdata);
|
||||
|
||||
/* we cannot cleanup the job object as we might
|
||||
* hit an error during transmission, so clean it
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -157,32 +157,27 @@ void* MPIR_Breakpoint(void)
|
||||
static char **global_mca_env = NULL;
|
||||
static orte_std_cntr_t total_num_apps = 0;
|
||||
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
struct orterun_globals_t orterun_globals = {0};
|
||||
static bool globals_init = false;
|
||||
|
||||
static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, 'h', NULL, "help", 0,
|
||||
&orterun_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"This help message" },
|
||||
{ NULL, 'V', NULL, "version", 0,
|
||||
&orterun_globals.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Print version and exit" },
|
||||
{ NULL, 'v', NULL, "verbose", 0,
|
||||
&orterun_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.verbose, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Be verbose" },
|
||||
{ "orte_execute_quiet", 'q', NULL, "quiet", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages" },
|
||||
{ NULL, '\0', "report-pid", "report-pid", 1,
|
||||
&orterun_globals.report_pid, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Printout pid on stdout [-], stderr [+], or a file [anything else]" },
|
||||
{ NULL, '\0', "report-uri", "report-uri", 1,
|
||||
&orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
|
||||
|
||||
/* exit status reporting */
|
||||
@ -219,12 +214,12 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
|
||||
/* select stdin option */
|
||||
{ NULL, '\0', "stdin", "stdin", 1,
|
||||
&orterun_globals.stdin_target, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" },
|
||||
|
||||
/* request that argv[0] be indexed */
|
||||
{ NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0,
|
||||
&orterun_globals.index_argv, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Uniquely index argv[0] for each process using its rank" },
|
||||
|
||||
/* Specify the launch agent to be used */
|
||||
@ -234,33 +229,33 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
|
||||
/* Preload the binary on the remote machine */
|
||||
{ NULL, 's', NULL, "preload-binary", 0,
|
||||
&orterun_globals.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Preload the binary on the remote machine before starting the remote process." },
|
||||
|
||||
/* Preload files on the remote machine */
|
||||
{ NULL, '\0', NULL, "preload-files", 1,
|
||||
&orterun_globals.preload_files, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Preload the comma separated list of files to the remote machines current working directory before starting the remote process." },
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
/* Tell SStore to preload a snapshot before launch */
|
||||
{ NULL, '\0', NULL, "sstore-load", 1,
|
||||
&orterun_globals.sstore_load, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.sstore_load, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Internal Use Only! Tell SStore to preload a snapshot before launch." },
|
||||
#endif
|
||||
|
||||
/* Use an appfile */
|
||||
{ NULL, '\0', NULL, "app", 1,
|
||||
&orterun_globals.appfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide an appfile; ignore all other command line options" },
|
||||
|
||||
/* Number of processes; -c, -n, --n, -np, and --np are all
|
||||
synonyms */
|
||||
{ NULL, 'c', "np", "np", 1,
|
||||
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
&orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of processes to run" },
|
||||
{ NULL, '\0', "n", "n", 1,
|
||||
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
&orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of processes to run" },
|
||||
|
||||
/* maximum size of VM - typically used to subdivide an allocation */
|
||||
@ -414,30 +409,27 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
{ "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"},
|
||||
{ NULL, 'H', "host", "host", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
/* mpiexec-like arguments */
|
||||
{ NULL, '\0', "wdir", "wdir", 1,
|
||||
&orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the working directory of the started processes" },
|
||||
{ NULL, '\0', "wd", "wd", 1,
|
||||
&orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Synonym for --wdir" },
|
||||
{ NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0,
|
||||
&orterun_globals.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Set the working directory of the started processes to their session directory" },
|
||||
{ NULL, '\0', "path", "path", 1,
|
||||
&orterun_globals.path, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"PATH to be used to look for executables to start processes" },
|
||||
|
||||
/* User-level debugger arguments */
|
||||
{ NULL, '\0', "tv", "tv", 0,
|
||||
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Deprecated backwards compatibility flag; synonym for \"--debug\"" },
|
||||
{ NULL, '\0', "debug", "debug", 0,
|
||||
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" },
|
||||
{ "orte_base_user_debugger", '\0', "debugger", "debugger", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
@ -505,7 +497,7 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
#endif
|
||||
|
||||
{ NULL, '\0', "disable-recovery", "disable-recovery", 0,
|
||||
&orterun_globals.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Disable recovery (resets all recovery options to off)" },
|
||||
|
||||
{ "state_novm_select", '\0', "novm", "novm", 0,
|
||||
@ -517,15 +509,15 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
"Used staged execution if inadequate resources are present (cannot support MPI jobs)" },
|
||||
|
||||
{ NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0,
|
||||
&orterun_globals.run_as_root, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Allow execution as root (STRONGLY DISCOURAGED)" },
|
||||
|
||||
{ NULL, '\0', "personality", "personality", 1,
|
||||
&orterun_globals.personality, OPAL_CMD_LINE_TYPE_STRING,
|
||||
&orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Programming model/language being used (default=\"ompi\")" },
|
||||
|
||||
{ NULL, '\0', "dvm", "dvm", 0,
|
||||
&orterun_globals.dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&orte_cmd_line.create_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Create a persistent distributed virtual machine (DVM)" },
|
||||
|
||||
/* End of list */
|
||||
@ -631,7 +623,7 @@ int orterun(int argc, char *argv[])
|
||||
|
||||
/* print version if requested. Do this before check for help so
|
||||
that --version --help works as one might expect. */
|
||||
if (orterun_globals.version) {
|
||||
if (orte_cmd_line.version) {
|
||||
char *str, *project_name = NULL;
|
||||
if (0 == strcmp(orte_basename, "mpirun")) {
|
||||
project_name = "Open MPI";
|
||||
@ -655,9 +647,9 @@ int orterun(int argc, char *argv[])
|
||||
* us to proceed if the allow-run-as-root flag was given. Otherwise,
|
||||
* exit with a giant warning flag
|
||||
*/
|
||||
if (0 == geteuid() && !orterun_globals.run_as_root) {
|
||||
if (0 == geteuid() && !orte_cmd_line.run_as_root) {
|
||||
fprintf(stderr, "--------------------------------------------------------------------------\n");
|
||||
if (orterun_globals.help) {
|
||||
if (orte_cmd_line.help) {
|
||||
fprintf(stderr, "%s cannot provide the help message when run as root.\n", orte_basename);
|
||||
} else {
|
||||
/* show_help is not yet available, so print an error manually */
|
||||
@ -699,7 +691,7 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* Check for help request */
|
||||
if (orterun_globals.help) {
|
||||
if (orte_cmd_line.help) {
|
||||
char *str, *args = NULL;
|
||||
char *project_name = NULL;
|
||||
if (0 == strcmp(orte_basename, "mpirun")) {
|
||||
@ -733,8 +725,8 @@ int orterun(int argc, char *argv[])
|
||||
* in the global struct as the app_file parser would replace it.
|
||||
* So handle this specific cmd line option manually.
|
||||
*/
|
||||
orterun_globals.prefix = NULL;
|
||||
orterun_globals.path_to_mpirun = NULL;
|
||||
orte_cmd_line.prefix = NULL;
|
||||
orte_cmd_line.path_to_mpirun = NULL;
|
||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") ||
|
||||
'/' == argv[0][0] || want_prefix_by_default) {
|
||||
size_t param_len;
|
||||
@ -742,24 +734,24 @@ int orterun(int argc, char *argv[])
|
||||
char* tmp_basename = NULL;
|
||||
/* If they specified an absolute path, strip off the
|
||||
/bin/<exec_name>" and leave just the prefix */
|
||||
orterun_globals.path_to_mpirun = opal_dirname(argv[0]);
|
||||
orte_cmd_line.path_to_mpirun = opal_dirname(argv[0]);
|
||||
/* Quick sanity check to ensure we got
|
||||
something/bin/<exec_name> and that the installation
|
||||
tree is at least more or less what we expect it to
|
||||
be */
|
||||
tmp_basename = opal_basename(orterun_globals.path_to_mpirun);
|
||||
tmp_basename = opal_basename(orte_cmd_line.path_to_mpirun);
|
||||
if (0 == strcmp("bin", tmp_basename)) {
|
||||
char* tmp = orterun_globals.path_to_mpirun;
|
||||
orterun_globals.path_to_mpirun = opal_dirname(tmp);
|
||||
char* tmp = orte_cmd_line.path_to_mpirun;
|
||||
orte_cmd_line.path_to_mpirun = opal_dirname(tmp);
|
||||
free(tmp);
|
||||
} else {
|
||||
free(orterun_globals.path_to_mpirun);
|
||||
orterun_globals.path_to_mpirun = NULL;
|
||||
free(orte_cmd_line.path_to_mpirun);
|
||||
orte_cmd_line.path_to_mpirun = NULL;
|
||||
}
|
||||
free(tmp_basename);
|
||||
}
|
||||
/* if both are given, check to see if they match */
|
||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") && NULL != orterun_globals.path_to_mpirun) {
|
||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") && NULL != orte_cmd_line.path_to_mpirun) {
|
||||
char *tmp_basename;
|
||||
/* if they don't match, then that merits a warning */
|
||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
||||
@ -767,7 +759,7 @@ int orterun(int argc, char *argv[])
|
||||
if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
|
||||
param[strlen(param)-1] = '\0';
|
||||
}
|
||||
tmp_basename = strdup(orterun_globals.path_to_mpirun);
|
||||
tmp_basename = strdup(orte_cmd_line.path_to_mpirun);
|
||||
if (0 == strcmp(OPAL_PATH_SEP, &(tmp_basename[strlen(tmp_basename)-1]))) {
|
||||
tmp_basename[strlen(tmp_basename)-1] = '\0';
|
||||
}
|
||||
@ -779,12 +771,12 @@ int orterun(int argc, char *argv[])
|
||||
* people can specify the backend prefix as different
|
||||
* from the local one
|
||||
*/
|
||||
free(orterun_globals.path_to_mpirun);
|
||||
orterun_globals.path_to_mpirun = NULL;
|
||||
free(orte_cmd_line.path_to_mpirun);
|
||||
orte_cmd_line.path_to_mpirun = NULL;
|
||||
}
|
||||
free(tmp_basename);
|
||||
} else if (NULL != orterun_globals.path_to_mpirun) {
|
||||
param = strdup(orterun_globals.path_to_mpirun);
|
||||
} else if (NULL != orte_cmd_line.path_to_mpirun) {
|
||||
param = strdup(orte_cmd_line.path_to_mpirun);
|
||||
} else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
|
||||
/* must be --prefix alone */
|
||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
||||
@ -807,7 +799,7 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
orterun_globals.prefix = param;
|
||||
orte_cmd_line.prefix = param;
|
||||
}
|
||||
want_prefix_by_default = true;
|
||||
}
|
||||
@ -846,8 +838,8 @@ int orterun(int argc, char *argv[])
|
||||
opal_finalize();
|
||||
|
||||
/* default our personality to OMPI */
|
||||
if (NULL == orterun_globals.personality) {
|
||||
orterun_globals.personality = strdup("ompi");
|
||||
if (NULL == orte_cmd_line.personality) {
|
||||
orte_cmd_line.personality = strdup("ompi");
|
||||
}
|
||||
|
||||
/* Check for some "global" command line params */
|
||||
@ -865,19 +857,19 @@ int orterun(int argc, char *argv[])
|
||||
*/
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
jdata->personality = strdup(orterun_globals.personality);
|
||||
jdata->personality = strdup(orte_cmd_line.personality);
|
||||
|
||||
/* check what user wants us to do with stdin */
|
||||
if (0 == strcmp(orterun_globals.stdin_target, "all")) {
|
||||
if (0 == strcmp(orte_cmd_line.stdin_target, "all")) {
|
||||
jdata->stdin_target = ORTE_VPID_WILDCARD;
|
||||
} else if (0 == strcmp(orterun_globals.stdin_target, "none")) {
|
||||
} else if (0 == strcmp(orte_cmd_line.stdin_target, "none")) {
|
||||
jdata->stdin_target = ORTE_VPID_INVALID;
|
||||
} else {
|
||||
jdata->stdin_target = strtoul(orterun_globals.stdin_target, NULL, 10);
|
||||
jdata->stdin_target = strtoul(orte_cmd_line.stdin_target, NULL, 10);
|
||||
}
|
||||
|
||||
/* if we want the argv's indexed, indicate that */
|
||||
if (orterun_globals.index_argv) {
|
||||
if (orte_cmd_line.index_argv) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_INDEX_ARGV, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
|
||||
@ -906,21 +898,21 @@ int orterun(int argc, char *argv[])
|
||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* check for request to report uri */
|
||||
if (NULL != orterun_globals.report_uri) {
|
||||
if (NULL != orte_cmd_line.report_uri) {
|
||||
FILE *fp;
|
||||
char *rml_uri;
|
||||
rml_uri = orte_rml.get_contact_info();
|
||||
if (0 == strcmp(orterun_globals.report_uri, "-")) {
|
||||
if (0 == strcmp(orte_cmd_line.report_uri, "-")) {
|
||||
/* if '-', then output to stdout */
|
||||
printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
} else if (0 == strcmp(orterun_globals.report_uri, "+")) {
|
||||
} else if (0 == strcmp(orte_cmd_line.report_uri, "+")) {
|
||||
/* if '+', output to stderr */
|
||||
fprintf(stderr, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
} else {
|
||||
fp = fopen(orterun_globals.report_uri, "w");
|
||||
fp = fopen(orte_cmd_line.report_uri, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orte_basename, "uri", orterun_globals.report_uri);
|
||||
orte_basename, "uri", orte_cmd_line.report_uri);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri);
|
||||
@ -1104,46 +1096,46 @@ static int init_globals(void)
|
||||
{
|
||||
/* Only CONSTRUCT things once */
|
||||
if (!globals_init) {
|
||||
orterun_globals.env_val = NULL;
|
||||
orterun_globals.appfile = NULL;
|
||||
orterun_globals.wdir = NULL;
|
||||
orterun_globals.path = NULL;
|
||||
orterun_globals.stdin_target = "0";
|
||||
orterun_globals.report_pid = NULL;
|
||||
orterun_globals.report_uri = NULL;
|
||||
orterun_globals.disable_recovery = false;
|
||||
orterun_globals.index_argv = false;
|
||||
orterun_globals.run_as_root = false;
|
||||
orterun_globals.personality = NULL;
|
||||
orterun_globals.dvm = false;
|
||||
orte_cmd_line.env_val = NULL;
|
||||
orte_cmd_line.appfile = NULL;
|
||||
orte_cmd_line.wdir = NULL;
|
||||
orte_cmd_line.path = NULL;
|
||||
orte_cmd_line.stdin_target = "0";
|
||||
orte_cmd_line.report_pid = NULL;
|
||||
orte_cmd_line.report_uri = NULL;
|
||||
orte_cmd_line.disable_recovery = false;
|
||||
orte_cmd_line.index_argv = false;
|
||||
orte_cmd_line.run_as_root = false;
|
||||
orte_cmd_line.personality = NULL;
|
||||
orte_cmd_line.create_dvm = false;
|
||||
}
|
||||
|
||||
/* Reset the other fields every time */
|
||||
|
||||
orterun_globals.help = false;
|
||||
orterun_globals.version = false;
|
||||
orterun_globals.verbose = false;
|
||||
orterun_globals.debugger = false;
|
||||
orterun_globals.num_procs = 0;
|
||||
if( NULL != orterun_globals.env_val )
|
||||
free( orterun_globals.env_val );
|
||||
orterun_globals.env_val = NULL;
|
||||
if( NULL != orterun_globals.appfile )
|
||||
free( orterun_globals.appfile );
|
||||
orterun_globals.appfile = NULL;
|
||||
if( NULL != orterun_globals.wdir )
|
||||
free( orterun_globals.wdir );
|
||||
orterun_globals.set_cwd_to_session_dir = false;
|
||||
orterun_globals.wdir = NULL;
|
||||
if( NULL != orterun_globals.path )
|
||||
free( orterun_globals.path );
|
||||
orterun_globals.path = NULL;
|
||||
orte_cmd_line.help = false;
|
||||
orte_cmd_line.version = false;
|
||||
orte_cmd_line.verbose = false;
|
||||
orte_cmd_line.debugger = false;
|
||||
orte_cmd_line.num_procs = 0;
|
||||
if( NULL != orte_cmd_line.env_val )
|
||||
free( orte_cmd_line.env_val );
|
||||
orte_cmd_line.env_val = NULL;
|
||||
if( NULL != orte_cmd_line.appfile )
|
||||
free( orte_cmd_line.appfile );
|
||||
orte_cmd_line.appfile = NULL;
|
||||
if( NULL != orte_cmd_line.wdir )
|
||||
free( orte_cmd_line.wdir );
|
||||
orte_cmd_line.set_cwd_to_session_dir = false;
|
||||
orte_cmd_line.wdir = NULL;
|
||||
if( NULL != orte_cmd_line.path )
|
||||
free( orte_cmd_line.path );
|
||||
orte_cmd_line.path = NULL;
|
||||
|
||||
orterun_globals.preload_binaries = false;
|
||||
orterun_globals.preload_files = NULL;
|
||||
orte_cmd_line.preload_binaries = false;
|
||||
orte_cmd_line.preload_files = NULL;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
orterun_globals.sstore_load = NULL;
|
||||
orte_cmd_line.sstore_load = NULL;
|
||||
#endif
|
||||
|
||||
/* All done */
|
||||
@ -1155,19 +1147,19 @@ static int init_globals(void)
|
||||
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
{
|
||||
/* check for request to report pid */
|
||||
if (NULL != orterun_globals.report_pid) {
|
||||
if (NULL != orte_cmd_line.report_pid) {
|
||||
FILE *fp;
|
||||
if (0 == strcmp(orterun_globals.report_pid, "-")) {
|
||||
if (0 == strcmp(orte_cmd_line.report_pid, "-")) {
|
||||
/* if '-', then output to stdout */
|
||||
printf("%d\n", (int)getpid());
|
||||
} else if (0 == strcmp(orterun_globals.report_pid, "+")) {
|
||||
} else if (0 == strcmp(orte_cmd_line.report_pid, "+")) {
|
||||
/* if '+', output to stderr */
|
||||
fprintf(stderr, "%d\n", (int)getpid());
|
||||
} else {
|
||||
fp = fopen(orterun_globals.report_pid, "w");
|
||||
fp = fopen(orte_cmd_line.report_pid, "w");
|
||||
if (NULL == fp) {
|
||||
orte_show_help("help-orterun.txt", "orterun:write_file", false,
|
||||
orte_basename, "pid", orterun_globals.report_pid);
|
||||
orte_basename, "pid", orte_cmd_line.report_pid);
|
||||
exit(0);
|
||||
}
|
||||
fprintf(fp, "%d\n", (int)getpid());
|
||||
@ -1177,12 +1169,12 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
|
||||
/* Do we want a user-level debugger? */
|
||||
|
||||
if (orterun_globals.debugger) {
|
||||
run_debugger(orte_basename, cmd_line, argc, argv, orterun_globals.num_procs);
|
||||
if (orte_cmd_line.debugger) {
|
||||
run_debugger(orte_basename, cmd_line, argc, argv, orte_cmd_line.num_procs);
|
||||
}
|
||||
|
||||
/* if recovery was disabled on the cmd line, do so */
|
||||
if (orterun_globals.disable_recovery) {
|
||||
if (orte_cmd_line.disable_recovery) {
|
||||
orte_enable_recovery = false;
|
||||
orte_max_restarts = 0;
|
||||
}
|
||||
@ -1389,8 +1381,8 @@ static int create_app(int argc, char* argv[],
|
||||
* $ mpirun -np 2 -mca foo bar --app launch.appfile
|
||||
* Only pick up '-mca foo bar' on this pass.
|
||||
*/
|
||||
if (NULL != orterun_globals.appfile) {
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orterun_globals.personality, argc, 0, argv))) {
|
||||
if (NULL != orte_cmd_line.appfile) {
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personality, argc, 0, argv))) {
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
@ -1409,9 +1401,9 @@ static int create_app(int argc, char* argv[],
|
||||
|
||||
/* Is there an appfile in here? */
|
||||
|
||||
if (NULL != orterun_globals.appfile) {
|
||||
if (NULL != orte_cmd_line.appfile) {
|
||||
OBJ_DESTRUCT(&cmd_line);
|
||||
return parse_appfile(jdata, strdup(orterun_globals.appfile), app_env);
|
||||
return parse_appfile(jdata, strdup(orte_cmd_line.appfile), app_env);
|
||||
}
|
||||
|
||||
/* Setup application context */
|
||||
@ -1435,7 +1427,7 @@ static int create_app(int argc, char* argv[],
|
||||
* mpirun -np 2 -mca foo bar ./my-app -mca bip bop
|
||||
* We want to pick up '-mca foo bar' but not '-mca bip bop'
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orterun_globals.personality,
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personality,
|
||||
argc, count, argv))) {
|
||||
goto cleanup;
|
||||
}
|
||||
@ -1443,8 +1435,8 @@ static int create_app(int argc, char* argv[],
|
||||
/* Grab all OMPI_* environment variables */
|
||||
|
||||
app->env = opal_argv_copy(*app_env);
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orterun_globals.personality,
|
||||
orterun_globals.path,
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.personality,
|
||||
orte_cmd_line.path,
|
||||
&cmd_line,
|
||||
environ, &app->env))) {
|
||||
goto cleanup;
|
||||
@ -1453,10 +1445,10 @@ static int create_app(int argc, char* argv[],
|
||||
|
||||
/* Did the user request a specific wdir? */
|
||||
|
||||
if (NULL != orterun_globals.wdir) {
|
||||
if (NULL != orte_cmd_line.wdir) {
|
||||
/* if this is a relative path, convert it to an absolute path */
|
||||
if (opal_path_is_absolute(orterun_globals.wdir)) {
|
||||
app->cwd = strdup(orterun_globals.wdir);
|
||||
if (opal_path_is_absolute(orte_cmd_line.wdir)) {
|
||||
app->cwd = strdup(orte_cmd_line.wdir);
|
||||
} else {
|
||||
/* get the cwd */
|
||||
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
|
||||
@ -1465,10 +1457,10 @@ static int create_app(int argc, char* argv[],
|
||||
goto cleanup;
|
||||
}
|
||||
/* construct the absolute path */
|
||||
app->cwd = opal_os_path(false, cwd, orterun_globals.wdir, NULL);
|
||||
app->cwd = opal_os_path(false, cwd, orte_cmd_line.wdir, NULL);
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
} else if (orterun_globals.set_cwd_to_session_dir) {
|
||||
} else if (orte_cmd_line.set_cwd_to_session_dir) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
} else {
|
||||
@ -1500,14 +1492,14 @@ static int create_app(int argc, char* argv[],
|
||||
* given above, check to see if they match
|
||||
*/
|
||||
if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
|
||||
NULL != orterun_globals.prefix) {
|
||||
NULL != orte_cmd_line.prefix) {
|
||||
/* if they don't match, then that merits a warning */
|
||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
||||
/* ensure we strip any trailing '/' */
|
||||
if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
|
||||
param[strlen(param)-1] = '\0';
|
||||
}
|
||||
value = strdup(orterun_globals.prefix);
|
||||
value = strdup(orte_cmd_line.prefix);
|
||||
if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) {
|
||||
value[strlen(value)-1] = '\0';
|
||||
}
|
||||
@ -1518,11 +1510,11 @@ static int create_app(int argc, char* argv[],
|
||||
* know that one is being used
|
||||
*/
|
||||
free(param);
|
||||
param = strdup(orterun_globals.prefix);
|
||||
param = strdup(orte_cmd_line.prefix);
|
||||
}
|
||||
free(value);
|
||||
} else if (NULL != orterun_globals.prefix) {
|
||||
param = strdup(orterun_globals.prefix);
|
||||
} else if (NULL != orte_cmd_line.prefix) {
|
||||
param = strdup(orte_cmd_line.prefix);
|
||||
} else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
|
||||
/* must be --prefix alone */
|
||||
param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
|
||||
@ -1592,18 +1584,18 @@ static int create_app(int argc, char* argv[],
|
||||
}
|
||||
|
||||
/* check for bozo error */
|
||||
if (0 > orterun_globals.num_procs) {
|
||||
if (0 > orte_cmd_line.num_procs) {
|
||||
orte_show_help("help-orterun.txt", "orterun:negative-nprocs",
|
||||
true, orte_basename, app->argv[0],
|
||||
orterun_globals.num_procs, NULL);
|
||||
orte_cmd_line.num_procs, NULL);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
app->num_procs = (orte_std_cntr_t)orterun_globals.num_procs;
|
||||
app->num_procs = (orte_std_cntr_t)orte_cmd_line.num_procs;
|
||||
total_num_apps++;
|
||||
|
||||
/* Capture any preload flags */
|
||||
if (orterun_globals.preload_binaries) {
|
||||
if (orte_cmd_line.preload_binaries) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
/* if we were told to cwd to the session dir and the app was given in
|
||||
@ -1614,21 +1606,21 @@ static int create_app(int argc, char* argv[],
|
||||
*/
|
||||
if (!opal_path_is_absolute(app->argv[0]) &&
|
||||
NULL == strstr(app->argv[0], "java")) {
|
||||
if (orterun_globals.preload_binaries) {
|
||||
if (orte_cmd_line.preload_binaries) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
} else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
}
|
||||
if (NULL != orterun_globals.preload_files) {
|
||||
if (NULL != orte_cmd_line.preload_files) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES, ORTE_ATTR_LOCAL,
|
||||
orterun_globals.preload_files, OPAL_STRING);
|
||||
orte_cmd_line.preload_files, OPAL_STRING);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
if(NULL != orterun_globals.sstore_load) {
|
||||
if(NULL != orte_cmd_line.sstore_load) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL,
|
||||
orterun_globals.sstore_load, OPAL_STRING);
|
||||
orte_cmd_line.sstore_load, OPAL_STRING);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -1837,9 +1829,9 @@ static int parse_appfile(orte_job_t *jdata, char *filename, char ***env)
|
||||
* Make sure to clear out this variable so we don't do anything odd in
|
||||
* app_create()
|
||||
*/
|
||||
if( NULL != orterun_globals.appfile ) {
|
||||
free( orterun_globals.appfile );
|
||||
orterun_globals.appfile = NULL;
|
||||
if( NULL != orte_cmd_line.appfile ) {
|
||||
free( orte_cmd_line.appfile );
|
||||
orte_cmd_line.appfile = NULL;
|
||||
}
|
||||
|
||||
/* Try to open the file */
|
||||
|
@ -32,45 +32,6 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
int orterun(int argc, char *argv[]);
|
||||
|
||||
/**
|
||||
* Global struct for catching orterun command line options.
|
||||
*/
|
||||
struct orterun_globals_t {
|
||||
bool help;
|
||||
bool version;
|
||||
bool verbose;
|
||||
char *report_pid;
|
||||
char *report_uri;
|
||||
bool exit;
|
||||
bool debugger;
|
||||
int num_procs;
|
||||
char *env_val;
|
||||
char *appfile;
|
||||
char *wdir;
|
||||
bool set_cwd_to_session_dir;
|
||||
char *path;
|
||||
char *preload_files;
|
||||
bool sleep;
|
||||
char *stdin_target;
|
||||
char *prefix;
|
||||
char *path_to_mpirun;
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
char *sstore_load;
|
||||
#endif
|
||||
bool disable_recovery;
|
||||
bool preload_binaries;
|
||||
bool index_argv;
|
||||
bool run_as_root;
|
||||
char *personality;
|
||||
bool dvm;
|
||||
};
|
||||
|
||||
/**
|
||||
* Struct holding values gleaned from the orterun command line -
|
||||
* needed by debugger init
|
||||
*/
|
||||
ORTE_DECLSPEC extern struct orterun_globals_t orterun_globals;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTERUN_ORTERUN_H */
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -261,6 +261,10 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key)
|
||||
return "JOB-LAUNCH-PROXY";
|
||||
case ORTE_JOB_NSPACE_REGISTERED:
|
||||
return "JOB-NSPACE-REGISTERED";
|
||||
case ORTE_JOB_FIXED_DVM:
|
||||
return "ORTE-JOB-FIXED-DVM";
|
||||
case ORTE_JOB_DVM_JOB:
|
||||
return "ORTE-JOB-DVM-JOB";
|
||||
|
||||
case ORTE_PROC_NOBARRIER:
|
||||
return "PROC-NOBARRIER";
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -130,6 +130,9 @@ typedef uint16_t orte_job_flags_t;
|
||||
#define ORTE_JOB_ROOM_NUM (ORTE_JOB_START_KEY + 39) // int - number of remote request's hotel room
|
||||
#define ORTE_JOB_LAUNCH_PROXY (ORTE_JOB_START_KEY + 40) // opal_process_name_t - name of spawn requestor
|
||||
#define ORTE_JOB_NSPACE_REGISTERED (ORTE_JOB_START_KEY + 41) // bool - job has been registered with embedded PMIx server
|
||||
#define ORTE_JOB_FIXED_DVM (ORTE_JOB_START_KEY + 42) // bool - do not change the size of the DVM for this job
|
||||
#define ORTE_JOB_DVM_JOB (ORTE_JOB_START_KEY + 43) // bool - job is using a DVM
|
||||
#define ORTE_JOB_CANCELLED (ORTE_JOB_START_KEY + 44) // bool - job was cancelled
|
||||
|
||||
#define ORTE_JOB_MAX_KEY 300
|
||||
|
||||
|
@ -54,7 +54,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
orte_node_t *node, *nd;
|
||||
opal_list_t adds;
|
||||
bool found;
|
||||
int slots;
|
||||
int slots=0;
|
||||
bool slots_given;
|
||||
char *cptr;
|
||||
|
||||
@ -237,8 +237,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
}
|
||||
node->name = strdup(ndname);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
|
||||
"%s dashhost: added node %s to list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||
"%s dashhost: added node %s to list - slots %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, slots));
|
||||
node->state = ORTE_NODE_STATE_UP;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
@ -273,6 +273,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
node->slots = nd->slots;
|
||||
if (ORTE_FLAG_TEST(nd, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
||||
node->slots = nd->slots;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -231,6 +231,9 @@ int orte_err2str(int errnum, const char **errmsg)
|
||||
case ORTE_ERR_FORCE_SELECT:
|
||||
retval = "Force select";
|
||||
break;
|
||||
case ORTE_ERR_JOB_CANCELLED:
|
||||
retval = "Job cancelled";
|
||||
break;
|
||||
default:
|
||||
if (orte_report_silent_errors) {
|
||||
retval = "Unknown error";
|
||||
|
@ -285,8 +285,9 @@ static int hostfile_parse_line(int token, opal_list_t* updates,
|
||||
free(node_name);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
|
||||
"%s hostfile: node %s slots %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots));
|
||||
"%s hostfile: node %s slots %d nodes-given %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots,
|
||||
ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN) ? "TRUE" : "FALSE"));
|
||||
/* mark the slots as "given" since we take them as being the
|
||||
* number specified via the rankfile
|
||||
*/
|
||||
|
@ -220,7 +220,6 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr, bool update)
|
||||
}
|
||||
/* if the daemon doesn't have a node, that's an error */
|
||||
if (NULL == (node = dmn->node)) {
|
||||
opal_output(0, "DAEMON %s HAS NO NODE", ORTE_NAME_PRINT(&dmn->name));
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user