Fully support OMPI spawn options. Fix a bug in the round-robin mappers where we weren't adding nodes to the job map node array, and so resources were not released
Signed-off-by: Ralph Castain <rhc@open-mpi.org> (cherry picked from commit 285d8cfef74ffc899e9c51e1d9c597b7fb2ceb89)
Этот коммит содержится в:
родитель
d704712bad
Коммит
fe9b584c05
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -33,6 +33,16 @@
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Resolve the boolean carried by an opal_value_t into a caller variable.
 *
 *   v - pointer to the value to inspect (evaluated more than once, so
 *       it must not have side effects)
 *   p - lvalue that receives the result
 *
 * A value whose type is OPAL_UNDEF is reported as true; for any other
 * type the value's data.flag field is copied into (p).
 */
#define OPAL_CHECK_BOOL(v, p)               \
    do {                                    \
        if (OPAL_UNDEF != (v)->type) {      \
            (p) = (v)->data.flag;           \
        } else {                            \
            (p) = true;                     \
        }                                   \
    } while(0)
|
||||
|
||||
/* A non-API function for something that happens in a number
|
||||
* of places throughout the code base - loading a value into
|
||||
* an opal_value_t structure
|
||||
|
@ -344,6 +344,21 @@ typedef uint32_t pmix_rank_t;
|
||||
// job - i.e., not part of the "comm_world" of the job
|
||||
#define PMIX_SET_SESSION_CWD "pmix.ssncwd" // (bool) set the application's current working directory to
|
||||
// the session working directory assigned by the RM
|
||||
#define PMIX_TAG_OUTPUT "pmix.tagout" // (bool) tag application output with the ID of the source
|
||||
#define PMIX_TIMESTAMP_OUTPUT "pmix.tsout" // (bool) timestamp output from applications
|
||||
#define PMIX_MERGE_STDERR_STDOUT "pmix.mergeerrout" // (bool) merge stdout and stderr streams from application procs
|
||||
#define PMIX_OUTPUT_TO_FILE "pmix.outfile" // (char*) output application output to given file
|
||||
#define PMIX_INDEX_ARGV "pmix.indxargv" // (bool) mark the argv with the rank of the proc
|
||||
#define PMIX_CPUS_PER_PROC "pmix.cpuperproc" // (uint32_t) #cpus to assign to each rank
|
||||
#define PMIX_NO_PROCS_ON_HEAD "pmix.nolocal" // (bool) do not place procs on the head node
|
||||
#define PMIX_NO_OVERSUBSCRIBE "pmix.noover" // (bool) do not oversubscribe the cpus
|
||||
#define PMIX_REPORT_BINDINGS "pmix.repbind" // (bool) report bindings of the individual procs
|
||||
#define PMIX_CPU_LIST "pmix.cpulist" // (char*) list of cpus to use for this job
|
||||
#define PMIX_JOB_RECOVERABLE "pmix.recover" // (bool) application supports recoverable operations
|
||||
#define PMIX_JOB_CONTINUOUS "pmix.continuous" // (bool) application is continuous, all failed procs should
|
||||
// be immediately restarted
|
||||
#define PMIX_MAX_RESTARTS "pmix.maxrestarts" // (uint32_t) max number of times to restart a job
|
||||
|
||||
|
||||
/* query attributes */
|
||||
#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
|
||||
|
@ -471,7 +471,7 @@ static pmix_status_t parse_uri_file(char *filename,
|
||||
* user isn't authorized to access it - or it may just
|
||||
* not exist yet! Check for existence */
|
||||
if (0 != access(filename, R_OK)) {
|
||||
if (ENOENT == errno) {
|
||||
if (ENOENT == errno && 0 < mca_ptl_tcp_component.wait_to_connect) {
|
||||
/* the file does not exist, so give it
|
||||
* a little time to see if the server
|
||||
* is still starting up */
|
||||
@ -979,6 +979,7 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
||||
}
|
||||
newdir = pmix_os_path(false, dirname, dir_entry->d_name, NULL);
|
||||
if (-1 == stat(newdir, &buf)) {
|
||||
free(newdir);
|
||||
continue;
|
||||
}
|
||||
/* if it is a directory, down search */
|
||||
|
@ -343,10 +343,12 @@ void pmix2x_server_deregister_nspace(opal_jobid_t jobid,
|
||||
if (jptr->jobid == jobid) {
|
||||
/* found it - tell the server to deregister */
|
||||
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
||||
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
|
||||
PMIx_server_deregister_nspace(jptr->nspace, lkcbfunc, (void*)&lock);
|
||||
OPAL_PMIX_WAIT_THREAD(&lock);
|
||||
OPAL_PMIX_DESTRUCT_LOCK(&lock);
|
||||
/* now get rid of it from our list */
|
||||
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
|
||||
opal_list_remove_item(&mca_pmix_pmix2x_component.jobids, &jptr->super);
|
||||
OBJ_RELEASE(jptr);
|
||||
break;
|
||||
|
@ -259,6 +259,21 @@ BEGIN_C_DECLS
|
||||
// job - i.e., not part of the "comm_world" of the job
|
||||
#define OPAL_PMIX_SET_SESSION_CWD "pmix.ssncwd" // (bool) set the application's current working directory to
|
||||
// the session working directory assigned by the RM
|
||||
#define OPAL_PMIX_TAG_OUTPUT "pmix.tagout" // (bool) tag application output with the ID of the source
|
||||
#define OPAL_PMIX_TIMESTAMP_OUTPUT "pmix.tsout" // (bool) timestamp output from applications
|
||||
#define OPAL_PMIX_MERGE_STDERR_STDOUT "pmix.mergeerrout" // (bool) merge stdout and stderr streams from application procs
|
||||
#define OPAL_PMIX_OUTPUT_TO_FILE "pmix.outfile" // (char*) output application output to given file
|
||||
#define OPAL_PMIX_INDEX_ARGV "pmix.indxargv" // (bool) mark the argv with the rank of the proc
|
||||
#define OPAL_PMIX_CPUS_PER_PROC "pmix.cpuperproc" // (uint32_t) #cpus to assign to each rank
|
||||
#define OPAL_PMIX_NO_PROCS_ON_HEAD "pmix.nolocal" // (bool) do not place procs on the head node
|
||||
#define OPAL_PMIX_NO_OVERSUBSCRIBE "pmix.noover" // (bool) do not oversubscribe the cpus
|
||||
#define OPAL_PMIX_REPORT_BINDINGS "pmix.repbind" // (bool) report bindings of the individual procs
|
||||
#define OPAL_PMIX_CPU_LIST "pmix.cpulist" // (char*) list of cpus to use for this job
|
||||
#define OPAL_PMIX_JOB_RECOVERABLE "pmix.recover" // (bool) application supports recoverable operations
|
||||
#define OPAL_PMIX_JOB_CONTINUOUS "pmix.continuous" // (bool) application is continuous, all failed procs should
|
||||
// be immediately restarted
|
||||
#define OPAL_PMIX_MAX_RESTARTS "pmix.maxrestarts" // (uint32_t) max number of times to restart a job
|
||||
|
||||
|
||||
/* query attributes */
|
||||
#define OPAL_PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces
|
||||
@ -282,6 +297,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
|
||||
// for the specified nspace
|
||||
|
||||
|
||||
/* log attributes */
|
||||
#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
|
||||
#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout
|
||||
|
@ -96,7 +96,7 @@ static int
|
||||
opal_err2str(int errnum, const char **errmsg)
|
||||
{
|
||||
const char *retval;
|
||||
opal_output(0, "OPAL ERR2STR %d", errnum);
|
||||
|
||||
switch (errnum) {
|
||||
case OPAL_SUCCESS:
|
||||
retval = "Success";
|
||||
|
@ -145,6 +145,8 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
|
||||
/* add this node to the map - do it only once */
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
OBJ_RETAIN(node);
|
||||
opal_pointer_array_add(jdata->map->nodes, node);
|
||||
++(jdata->map->num_nodes);
|
||||
}
|
||||
if (add_one) {
|
||||
@ -284,6 +286,8 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
||||
/* add this node to the map, but only do so once */
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
OBJ_RETAIN(node);
|
||||
opal_pointer_array_add(jdata->map->nodes, node);
|
||||
++(jdata->map->num_nodes);
|
||||
}
|
||||
if (oversubscribed) {
|
||||
@ -532,6 +536,8 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
||||
/* add this node to the map, if reqd */
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
OBJ_RETAIN(node);
|
||||
opal_pointer_array_add(jdata->map->nodes, node);
|
||||
++(jdata->map->num_nodes);
|
||||
}
|
||||
nmapped = 0;
|
||||
@ -678,6 +684,8 @@ static int byobj_span(orte_job_t *jdata,
|
||||
/* add this node to the map, if reqd */
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
|
||||
OBJ_RETAIN(node);
|
||||
opal_pointer_array_add(jdata->map->nodes, node);
|
||||
++(jdata->map->num_nodes);
|
||||
}
|
||||
/* get the number of objects of this type on this node */
|
||||
|
@ -502,11 +502,6 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
"Create a persistent distributed virtual machine (DVM)",
|
||||
OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* tell the dvm to terminate */
|
||||
{ NULL, '\0', "terminate", "terminate", 0,
|
||||
&orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Terminate the DVM", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* fwd mpirun port */
|
||||
{ "orte_fwd_mpirun_port", '\0', "fwd-mpirun-port", "fwd-mpirun-port", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
|
@ -450,7 +450,7 @@ static void check_complete(int fd, short args, void *cbdata)
|
||||
* we call the errmgr so that any attempt to restart the job will
|
||||
* avoid doing so in the exact same place as the current job
|
||||
*/
|
||||
if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
|
||||
if (NULL != jdata->map) {
|
||||
map = jdata->map;
|
||||
for (index = 0; index < map->nodes->size; index++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
|
||||
|
@ -820,7 +820,6 @@ int orte_submit_job(char *argv[], int *index,
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_MERGE_STDERR_STDOUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
|
||||
|
||||
/* check what user wants us to do with stdin */
|
||||
if (NULL != orte_cmd_options.stdin_target) {
|
||||
if (0 == strcmp(orte_cmd_options.stdin_target, "all")) {
|
||||
@ -902,7 +901,7 @@ int orte_submit_job(char *argv[], int *index,
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_USE_LOCAL);
|
||||
}
|
||||
if (orte_cmd_options.no_oversubscribe) {
|
||||
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
}
|
||||
if (orte_cmd_options.oversubscribe) {
|
||||
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
|
@ -42,6 +42,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/threads.h"
|
||||
@ -59,6 +60,7 @@ void pmix_server_launch_resp(int status, orte_process_name_t* sender,
|
||||
int rc, room;
|
||||
int32_t ret, cnt;
|
||||
orte_jobid_t jobid;
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* unpack the status */
|
||||
cnt = 1;
|
||||
@ -93,6 +95,11 @@ void pmix_server_launch_resp(int status, orte_process_name_t* sender,
|
||||
if (NULL != req->spcbfunc) {
|
||||
req->spcbfunc(ret, jobid, req->cbdata);
|
||||
}
|
||||
/* if we failed to launch, then ensure we cleanup */
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
jdata = orte_get_job_data_object(jobid);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
}
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(req);
|
||||
}
|
||||
@ -164,8 +171,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
|
||||
opal_pmix_app_t *papp;
|
||||
opal_value_t *info, *next;
|
||||
opal_list_t *cache;
|
||||
int rc;
|
||||
int rc, i;
|
||||
char cwd[OPAL_PATH_MAX];
|
||||
bool flag;
|
||||
|
||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||
"%s spawn called from proc %s",
|
||||
@ -176,108 +184,6 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
|
||||
jdata = OBJ_NEW(orte_job_t);
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
|
||||
/* transfer the job info across */
|
||||
OPAL_LIST_FOREACH_SAFE(info, next, job_info, opal_value_t) {
|
||||
if (0 == strcmp(info->key, OPAL_PMIX_PERSONALITY)) {
|
||||
jdata->personality = opal_argv_split(info->data.string, ',');
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_MAPPER)) {
|
||||
jdata->map->req_mapper = strdup(info->data.string);
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_DISPLAY_MAP)) {
|
||||
jdata->map->display_map = true;
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_PPR)) {
|
||||
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
|
||||
/* not allowed to provide multiple mapping policies */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy",
|
||||
true, "mapping", info->data.string,
|
||||
orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_PPR);
|
||||
jdata->map->ppr = strdup(info->data.string);
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_MAPBY)) {
|
||||
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
|
||||
/* not allowed to provide multiple mapping policies */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy",
|
||||
true, "mapping", info->data.string,
|
||||
orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = orte_rmaps_base_set_mapping_policy(&jdata->map->mapping,
|
||||
NULL, info->data.string);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_RANKBY)) {
|
||||
if (ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
|
||||
/* not allowed to provide multiple ranking policies */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy",
|
||||
true, "ranking", info->data.string,
|
||||
orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = orte_rmaps_base_set_ranking_policy(&jdata->map->ranking,
|
||||
jdata->map->mapping,
|
||||
info->data.string);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_BINDTO)) {
|
||||
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||
/* not allowed to provide multiple mapping policies */
|
||||
orte_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
|
||||
info->data.string,
|
||||
opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = opal_hwloc_base_set_binding_policy(&jdata->map->binding,
|
||||
info->data.string);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NON_PMI)) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NON_ORTE_JOB,
|
||||
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_REQUESTOR_IS_TOOL)) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB,
|
||||
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
/* request that IO be forwarded to the requesting tool */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL,
|
||||
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_STDIN_TGT)) {
|
||||
if (0 == strcmp(info->data.string, "all")) {
|
||||
jdata->stdin_target = ORTE_VPID_WILDCARD;
|
||||
} else if (0 == strcmp(info->data.string, "none")) {
|
||||
jdata->stdin_target = ORTE_VPID_INVALID;
|
||||
} else {
|
||||
jdata->stdin_target = strtoul(info->data.string, NULL, 10);
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NOTIFY_COMPLETION)) {
|
||||
if (OPAL_UNDEF == info->type || info->data.flag) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION,
|
||||
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_DEBUG_STOP_ON_EXEC)) {
|
||||
/* we don't know how to do this */
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
} else {
|
||||
/* cache for inclusion with job info at registration */
|
||||
cache = NULL;
|
||||
opal_list_remove_item(job_info, &info->super);
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) &&
|
||||
NULL != cache) {
|
||||
opal_list_append(cache, &info->super);
|
||||
} else {
|
||||
cache = OBJ_NEW(opal_list_t);
|
||||
opal_list_append(cache, &info->super);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if the job is missing a personality setting, add it */
|
||||
if (NULL == jdata->personality) {
|
||||
opal_argv_append_nosize(&jdata->personality, "ompi");
|
||||
}
|
||||
|
||||
/* transfer the apps across */
|
||||
OPAL_LIST_FOREACH(papp, apps, opal_pmix_app_t) {
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
@ -334,8 +240,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
|
||||
app->cwd = opal_os_path(false, cwd, info->data.string, NULL);
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_PRELOAD_BIN)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN,
|
||||
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_PRELOAD_FILES)) {
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES,
|
||||
ORTE_ATTR_GLOBAL, info->data.string, OPAL_STRING);
|
||||
@ -347,9 +254,223 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
|
||||
}
|
||||
}
|
||||
|
||||
/* transfer the job info across */
|
||||
OPAL_LIST_FOREACH_SAFE(info, next, job_info, opal_value_t) {
|
||||
/*** PERSONALITY ***/
|
||||
if (0 == strcmp(info->key, OPAL_PMIX_PERSONALITY)) {
|
||||
jdata->personality = opal_argv_split(info->data.string, ',');
|
||||
|
||||
/*** REQUESTED MAPPER ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_MAPPER)) {
|
||||
jdata->map->req_mapper = strdup(info->data.string);
|
||||
|
||||
/*** DISPLAY MAP ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_DISPLAY_MAP)) {
|
||||
OPAL_CHECK_BOOL(info, jdata->map->display_map);
|
||||
|
||||
/*** PPR (PROCS-PER-RESOURCE) ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_PPR)) {
|
||||
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
|
||||
/* not allowed to provide multiple mapping policies */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy",
|
||||
true, "mapping", info->data.string,
|
||||
orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_PPR);
|
||||
jdata->map->ppr = strdup(info->data.string);
|
||||
|
||||
/*** MAP-BY ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_MAPBY)) {
|
||||
if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
|
||||
/* not allowed to provide multiple mapping policies */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy",
|
||||
true, "mapping", info->data.string,
|
||||
orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = orte_rmaps_base_set_mapping_policy(&jdata->map->mapping,
|
||||
NULL, info->data.string);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*** RANK-BY ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_RANKBY)) {
|
||||
if (ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
|
||||
/* not allowed to provide multiple ranking policies */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "redefining-policy",
|
||||
true, "ranking", info->data.string,
|
||||
orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = orte_rmaps_base_set_ranking_policy(&jdata->map->ranking,
|
||||
jdata->map->mapping,
|
||||
info->data.string);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*** BIND-TO ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_BINDTO)) {
|
||||
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||
/* not allowed to provide multiple mapping policies */
|
||||
orte_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
|
||||
info->data.string,
|
||||
opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
rc = opal_hwloc_base_set_binding_policy(&jdata->map->binding,
|
||||
info->data.string);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*** CPUS/RANK ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_CPUS_PER_PROC)) {
|
||||
jdata->map->cpus_per_rank = info->data.uint32;
|
||||
|
||||
/*** NO USE LOCAL ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NO_PROCS_ON_HEAD)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_MAPPING_NO_USE_LOCAL,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** OVERSUBSCRIBE ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NO_OVERSUBSCRIBE)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
if (flag) {
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
} else {
|
||||
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
}
|
||||
|
||||
/*** REPORT BINDINGS ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_REPORT_BINDINGS)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_REPORT_BINDINGS,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** CPU LIST ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_CPU_LIST)) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_CPU_LIST,
|
||||
ORTE_ATTR_GLOBAL, info->data.string, OPAL_BOOL);
|
||||
|
||||
/*** RECOVERABLE ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_JOB_RECOVERABLE)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
if (flag) {
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE);
|
||||
} else {
|
||||
ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_RECOVERABLE);
|
||||
}
|
||||
|
||||
/*** MAX RESTARTS ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_MAX_RESTARTS)) {
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_MAX_RESTARTS,
|
||||
ORTE_ATTR_GLOBAL, &info->data.uint32, OPAL_INT32);
|
||||
}
|
||||
|
||||
/*** CONTINUOUS OPERATION ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_JOB_CONTINUOUS)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** NON-PMI JOB ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NON_PMI)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NON_ORTE_JOB,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** SPAWN REQUESTOR IS TOOL ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_REQUESTOR_IS_TOOL)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
if (flag) {
|
||||
/* request that IO be forwarded to the requesting tool */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_FWDIO_TO_TOOL,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
}
|
||||
|
||||
/*** NOTIFY UPON JOB COMPLETION ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NOTIFY_COMPLETION)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** STOP ON EXEC FOR DEBUGGER ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_DEBUG_STOP_ON_EXEC)) {
|
||||
/* we don't know how to do this */
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
|
||||
/*** TAG STDOUT ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_TAG_OUTPUT)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_TAG_OUTPUT,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** TIMESTAMP OUTPUT ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_TIMESTAMP_OUTPUT)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_TIMESTAMP_OUTPUT,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** OUTPUT TO FILES ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_OUTPUT_TO_FILE)) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_OUTPUT_TO_FILE,
|
||||
ORTE_ATTR_GLOBAL, info->data.string, OPAL_STRING);
|
||||
|
||||
/*** MERGE STDERR TO STDOUT ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_MERGE_STDERR_STDOUT)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_MERGE_STDERR_STDOUT,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** STDIN TARGET ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_STDIN_TGT)) {
|
||||
if (0 == strcmp(info->data.string, "all")) {
|
||||
jdata->stdin_target = ORTE_VPID_WILDCARD;
|
||||
} else if (0 == strcmp(info->data.string, "none")) {
|
||||
jdata->stdin_target = ORTE_VPID_INVALID;
|
||||
} else {
|
||||
jdata->stdin_target = strtoul(info->data.string, NULL, 10);
|
||||
}
|
||||
|
||||
/*** INDEX ARGV ***/
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_INDEX_ARGV)) {
|
||||
OPAL_CHECK_BOOL(info, flag);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_INDEX_ARGV,
|
||||
ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
|
||||
|
||||
/*** DEFAULT - CACHE FOR INCLUSION WITH JOB INFO ***/
|
||||
} else {
|
||||
/* cache for inclusion with job info at registration */
|
||||
cache = NULL;
|
||||
opal_list_remove_item(job_info, &info->super);
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) &&
|
||||
NULL != cache) {
|
||||
opal_list_append(cache, &info->super);
|
||||
} else {
|
||||
cache = OBJ_NEW(opal_list_t);
|
||||
opal_list_append(cache, &info->super);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if the job is missing a personality setting, add it */
|
||||
if (NULL == jdata->personality) {
|
||||
opal_argv_append_nosize(&jdata->personality, "ompi");
|
||||
}
|
||||
|
||||
/* indicate the requestor so bookmarks can be correctly set */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, ORTE_ATTR_GLOBAL,
|
||||
requestor, OPAL_NAME);
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY,
|
||||
ORTE_ATTR_GLOBAL, requestor, OPAL_NAME);
|
||||
|
||||
/* setup a spawn tracker so we know who to call back when this is done
|
||||
* and thread-shift the entire thing so it can be safely added to
|
||||
|
@ -785,10 +785,12 @@ static void _toolconn(int sd, short args, void *cbdata)
|
||||
OBJ_RETAIN(node);
|
||||
opal_pointer_array_add(jdata->map->nodes, node);
|
||||
jdata->map->num_nodes++;
|
||||
/* and it obviously is on the node */
|
||||
/* and it obviously is on the node - note that
|
||||
* we do _not_ increment the #procs on the node
|
||||
* as the tool doesn't count against the slot
|
||||
* allocation */
|
||||
OBJ_RETAIN(proc);
|
||||
opal_pointer_array_add(node->procs, proc);
|
||||
node->num_procs++;
|
||||
/* set the trivial */
|
||||
proc->local_rank = 0;
|
||||
proc->node_rank = 0;
|
||||
|
@ -82,70 +82,20 @@
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/schizo/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/orted/orted_submit.h"
|
||||
|
||||
/* ensure I can behave like a daemon */
|
||||
#include "prun.h"
|
||||
|
||||
/**
|
||||
* Global struct for caching orte command line options.
|
||||
*/
|
||||
struct orte_cmd_options_t {
|
||||
char *help;
|
||||
bool version;
|
||||
bool verbose;
|
||||
char *report_pid;
|
||||
char *report_uri;
|
||||
bool terminate;
|
||||
bool debugger;
|
||||
int num_procs;
|
||||
char *appfile;
|
||||
char *wdir;
|
||||
bool set_cwd_to_session_dir;
|
||||
char *path;
|
||||
char *preload_files;
|
||||
bool sleep;
|
||||
char *stdin_target;
|
||||
char *prefix;
|
||||
char *path_to_mpirun;
|
||||
bool disable_recovery;
|
||||
bool preload_binaries;
|
||||
bool index_argv;
|
||||
bool run_as_root;
|
||||
char *personality;
|
||||
bool create_dvm;
|
||||
static struct {
|
||||
bool terminate_dvm;
|
||||
bool nolocal;
|
||||
bool no_oversubscribe;
|
||||
bool oversubscribe;
|
||||
int cpus_per_proc;
|
||||
bool pernode;
|
||||
int npernode;
|
||||
bool use_hwthreads_as_cpus;
|
||||
int npersocket;
|
||||
char *mapping_policy;
|
||||
char *ranking_policy;
|
||||
char *binding_policy;
|
||||
bool report_bindings;
|
||||
char *cpu_list;
|
||||
bool debug;
|
||||
bool tag_output;
|
||||
bool timestamp_output;
|
||||
char *output_filename;
|
||||
bool merge;
|
||||
bool continuous;
|
||||
char *hnp;
|
||||
bool staged_exec;
|
||||
int timeout;
|
||||
bool report_state_on_timeout;
|
||||
bool get_stack_traces;
|
||||
int pid;
|
||||
bool system_server_only;
|
||||
bool system_server_first;
|
||||
};
|
||||
typedef struct orte_cmd_options_t orte_cmd_options_t;
|
||||
static orte_cmd_options_t orte_cmd_options = {0};
|
||||
static opal_cmd_line_t *orte_cmd_line = NULL;
|
||||
bool system_server_only;
|
||||
int pid;
|
||||
} myoptions;
|
||||
|
||||
static opal_list_t job_info;
|
||||
static volatile bool active = false;
|
||||
|
||||
@ -158,335 +108,24 @@ static void set_classpath_jar_file(opal_pmix_app_t *app, int index, char *jarfil
|
||||
|
||||
|
||||
static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/* Various "obvious" options */
|
||||
{ NULL, 'h', NULL, "help", 1,
|
||||
&orte_cmd_options.help, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"This help message", OPAL_CMD_LINE_OTYPE_GENERAL },
|
||||
{ NULL, 'V', NULL, "version", 0,
|
||||
&orte_cmd_options.version, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Print version and exit", OPAL_CMD_LINE_OTYPE_GENERAL },
|
||||
{ "orte_execute_quiet", 'q', NULL, "quiet", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Suppress helpful messages", OPAL_CMD_LINE_OTYPE_GENERAL },
|
||||
|
||||
/* exit status reporting */
|
||||
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Return the exit status of the primary job only", OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
|
||||
/* select XML output */
|
||||
{ "orte_xml_output", '\0', "xml", "xml", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Provide all output in XML format", OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
{ "orte_xml_file", '\0', "xml-file", "xml-file", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide all output in XML format to the specified file", OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
|
||||
/* tag output */
|
||||
{ "orte_tag_output", '\0', "tag-output", "tag-output", 0,
|
||||
&orte_cmd_options.tag_output, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Tag all output with [job,rank]", OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
{ "orte_timestamp_output", '\0', "timestamp-output", "timestamp-output", 0,
|
||||
&orte_cmd_options.timestamp_output, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Timestamp all application process output", OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
{ "orte_output_filename", '\0', "output-filename", "output-filename", 1,
|
||||
&orte_cmd_options.output_filename, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Redirect output from application processes into filename/job/rank/std[out,err,diag]",
|
||||
OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
{ NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0,
|
||||
&orte_cmd_options.merge, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Merge stderr to stdout for each process", OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
{ "orte_xterm", '\0', "xterm", "xterm", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Create a new xterm window and display output from the specified ranks there",
|
||||
OPAL_CMD_LINE_OTYPE_OUTPUT },
|
||||
|
||||
/* select stdin option */
|
||||
{ NULL, '\0', "stdin", "stdin", 1,
|
||||
&orte_cmd_options.stdin_target, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)",
|
||||
OPAL_CMD_LINE_OTYPE_INPUT },
|
||||
|
||||
/* request that argv[0] be indexed */
|
||||
{ NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0,
|
||||
&orte_cmd_options.index_argv, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Uniquely index argv[0] for each process using its rank",
|
||||
OPAL_CMD_LINE_OTYPE_INPUT },
|
||||
|
||||
/* Specify the launch agent to be used */
|
||||
{ "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Command used to start processes on remote nodes (default: orted)",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* Preload the binary on the remote machine */
|
||||
{ NULL, 's', NULL, "preload-binary", 0,
|
||||
&orte_cmd_options.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Preload the binary on the remote machine before starting the remote process.",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* Preload files on the remote machine */
|
||||
{ NULL, '\0', NULL, "preload-files", 1,
|
||||
&orte_cmd_options.preload_files, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Preload the comma separated list of files to the remote machines current working directory before starting the remote process.",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* Use an appfile */
|
||||
{ NULL, '\0', NULL, "app", 1,
|
||||
&orte_cmd_options.appfile, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide an appfile; ignore all other command line options",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* Number of processes; -c, -n, --n, -np, and --np are all
|
||||
synonyms */
|
||||
{ NULL, 'c', "np", "np", 1,
|
||||
&orte_cmd_options.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of processes to run", OPAL_CMD_LINE_OTYPE_GENERAL },
|
||||
{ NULL, '\0', "n", "n", 1,
|
||||
&orte_cmd_options.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of processes to run", OPAL_CMD_LINE_OTYPE_GENERAL },
|
||||
|
||||
/* Set a hostfile */
|
||||
{ NULL, '\0', "hostfile", "hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile", OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
{ NULL, '\0', "machinefile", "machinefile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a hostfile", OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
{ "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a default hostfile", OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
{ "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not attempt to resolve interfaces", OPAL_CMD_LINE_OTYPE_DEVEL },
|
||||
|
||||
{ "orte_rankfile", '\0', "rf", "rankfile", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Provide a rankfile file", OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
|
||||
/* Export environment variables; potentially used multiple times,
|
||||
so it does not make sense to set into a variable */
|
||||
{ NULL, 'x', NULL, NULL, 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL,
|
||||
"Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)", OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* Mapping controls */
|
||||
{ "rmaps_base_display_map", '\0', "display-map", "display-map", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the process map just before launch", OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
{ "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display a detailed process map (mostly intended for developers) just before launch",
|
||||
OPAL_CMD_LINE_OTYPE_DEVEL },
|
||||
{ "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the topology as part of the process map (mostly intended for developers) just before launch",
|
||||
OPAL_CMD_LINE_OTYPE_DEVEL },
|
||||
{ "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display a diffable process map (mostly intended for developers) just before launch",
|
||||
OPAL_CMD_LINE_OTYPE_DEVEL },
|
||||
{ NULL, 'H', "host", "host", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
{ "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0,
|
||||
&orte_cmd_options.nolocal, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Do not run any MPI applications on the local node",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
{ "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
|
||||
&orte_cmd_options.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are not to be oversubscribed, even if the system supports such operation",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
{ "rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0,
|
||||
&orte_cmd_options.oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
{ "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1,
|
||||
&orte_cmd_options.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Number of cpus to use for each process [default=1]",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
{ "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1,
|
||||
&orte_cmd_options.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Synonym for cpus-per-proc", OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
|
||||
/* backward compatibility */
|
||||
{ "rmaps_base_bycore", '\0', "bycore", "bycore", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to map and rank processes round-robin by core",
|
||||
OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
{ "rmaps_base_bynode", '\0', "bynode", "bynode", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to map and rank processes round-robin by node",
|
||||
OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
{ "rmaps_base_byslot", '\0', "byslot", "byslot", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to map and rank processes round-robin by slot",
|
||||
OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
|
||||
/* Nperxxx options that do not require topology and are always
|
||||
* available - included for backwards compatibility
|
||||
*/
|
||||
{ "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0,
|
||||
&orte_cmd_options.pernode, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Launch one process per available node",
|
||||
OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
{ "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1,
|
||||
&orte_cmd_options.npernode, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Launch n processes per node on all allocated nodes",
|
||||
OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
{ "rmaps_ppr_n_pernode", '\0', "N", NULL, 1,
|
||||
&orte_cmd_options.npernode, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Launch n processes per node on all allocated nodes (synonym for 'map-by node')",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
|
||||
/* declare hardware threads as independent cpus */
|
||||
{ "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Use hardware threads as independent cpus", OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
|
||||
/* include npersocket for backwards compatibility */
|
||||
{ "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1,
|
||||
&orte_cmd_options.npersocket, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Launch n processes per socket on all allocated nodes",
|
||||
OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
|
||||
/* Mapping options */
|
||||
{ "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1,
|
||||
&orte_cmd_options.mapping_policy, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
|
||||
/* Ranking options */
|
||||
{ "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1,
|
||||
&orte_cmd_options.ranking_policy, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]",
|
||||
OPAL_CMD_LINE_OTYPE_RANKING },
|
||||
|
||||
/* Binding options */
|
||||
{ "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1,
|
||||
&orte_cmd_options.binding_policy, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported", OPAL_CMD_LINE_OTYPE_BINDING },
|
||||
|
||||
/* backward compatibility */
|
||||
{ "hwloc_base_bind_to_core", '\0', "bind-to-core", "bind-to-core", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Bind processes to cores", OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
{ "hwloc_base_bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Bind processes to sockets", OPAL_CMD_LINE_OTYPE_COMPAT },
|
||||
|
||||
{ "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0,
|
||||
&orte_cmd_options.report_bindings, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Whether to report process bindings to stderr",
|
||||
OPAL_CMD_LINE_OTYPE_BINDING },
|
||||
|
||||
/* slot list option */
|
||||
{ "hwloc_base_cpu_list", '\0', "cpu-list", "cpu-list", 1,
|
||||
&orte_cmd_options.cpu_list, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of processor IDs to bind processes to [default=NULL]",
|
||||
OPAL_CMD_LINE_OTYPE_BINDING },
|
||||
|
||||
/* generalized pattern mapping option */
|
||||
{ "rmaps_ppr_pattern", '\0', NULL, "ppr", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Comma-separated list of number of processes on a given resource type [default: none]",
|
||||
OPAL_CMD_LINE_OTYPE_MAPPING },
|
||||
|
||||
/* Allocation options */
|
||||
{ "orte_display_alloc", '\0', "display-allocation", "display-allocation", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the allocation being used by this job", OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
{ "orte_display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display a detailed list (mostly intended for developers) of the allocation being used by this job",
|
||||
OPAL_CMD_LINE_OTYPE_DEVEL },
|
||||
{ "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
|
||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
|
||||
/* mpiexec-like arguments */
|
||||
{ NULL, '\0', "wdir", "wdir", 1,
|
||||
&orte_cmd_options.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Set the working directory of the started processes",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
{ NULL, '\0', "wd", "wd", 1,
|
||||
&orte_cmd_options.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Synonym for --wdir", OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
{ NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0,
|
||||
&orte_cmd_options.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Set the working directory of the started processes to their session directory",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
{ NULL, '\0', "path", "path", 1,
|
||||
&orte_cmd_options.path, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"PATH to be used to look for executables to start processes",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* User-level debugger arguments */
|
||||
{ NULL, '\0', "tv", "tv", 0,
|
||||
&orte_cmd_options.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Deprecated backwards compatibility flag; synonym for \"--debug\"",
|
||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
{ NULL, '\0', "debug", "debug", 0,
|
||||
&orte_cmd_options.debugger, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter",
|
||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
{ "orte_base_user_debugger", '\0', "debugger", "debugger", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Sequence of debuggers to search for when \"--debug\" is used",
|
||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
{ "orte_output_debugger_proctable", '\0', "output-proctable", "output-proctable", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Output the debugger proctable after launch",
|
||||
OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
|
||||
{ "orte_report_events", '\0', "report-events", "report-events", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Report events to a tool listening at the specified URI", OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
|
||||
{ "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable recovery from process failure [Default = disabled]",
|
||||
OPAL_CMD_LINE_OTYPE_UNSUPPORTED },
|
||||
|
||||
{ "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Max number of times to restart a failed process",
|
||||
OPAL_CMD_LINE_OTYPE_UNSUPPORTED },
|
||||
|
||||
{ NULL, '\0', "continuous", "continuous", 0,
|
||||
&orte_cmd_options.continuous, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Job is to run until explicitly terminated", OPAL_CMD_LINE_OTYPE_DEBUG },
|
||||
|
||||
{ NULL, '\0', "disable-recovery", "disable-recovery", 0,
|
||||
&orte_cmd_options.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Disable recovery (resets all recovery options to off)",
|
||||
OPAL_CMD_LINE_OTYPE_UNSUPPORTED },
|
||||
|
||||
{ NULL, '\0', "personality", "personality", 1,
|
||||
&orte_cmd_options.personality, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")",
|
||||
OPAL_CMD_LINE_OTYPE_LAUNCH },
|
||||
|
||||
/* tell the dvm to terminate */
|
||||
{ NULL, '\0', "terminate", "terminate", 0,
|
||||
&orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&myoptions.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Terminate the DVM", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* look first for a system server */
|
||||
{ NULL, '\0', "system-server-first", "system-server-first", 0,
|
||||
&orte_cmd_options.system_server_first, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&myoptions.system_server_first, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"First look for a system server and connect to it if found", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* connect only to a system server */
|
||||
{ NULL, '\0', "system-server-only", "system-server-only", 0,
|
||||
&orte_cmd_options.system_server_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
&myoptions.system_server_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Connect only to a system-level server", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* provide a connection PID */
|
||||
{ NULL, '\0', "pid", "pid", 1,
|
||||
&orte_cmd_options.pid, OPAL_CMD_LINE_TYPE_INT,
|
||||
&myoptions.pid, OPAL_CMD_LINE_TYPE_INT,
|
||||
"PID of the session-level daemon to which we should connect",
|
||||
OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
@ -556,6 +195,7 @@ int prun(int argc, char *argv[])
|
||||
char *param;
|
||||
opal_pmix_lock_t lock;
|
||||
opal_list_t apps;
|
||||
opal_pmix_app_t *app;
|
||||
opal_value_t *val;
|
||||
opal_list_t info;
|
||||
opal_jobid_t jobid;
|
||||
@ -563,6 +203,7 @@ int prun(int argc, char *argv[])
|
||||
|
||||
/* init the globals */
|
||||
memset(&orte_cmd_options, 0, sizeof(orte_cmd_options));
|
||||
memset(&myoptions, 0, sizeof(myoptions));
|
||||
OBJ_CONSTRUCT(&job_info, opal_list_t);
|
||||
OBJ_CONSTRUCT(&apps, opal_list_t);
|
||||
|
||||
@ -597,12 +238,30 @@ int prun(int argc, char *argv[])
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* set our proc type for schizo selection */
|
||||
orte_process_info.proc_type = ORTE_PROC_TOOL;
|
||||
|
||||
/* open the SCHIZO framework so we can setup the command line */
|
||||
if (ORTE_SUCCESS != (rc = mca_base_framework_open(&orte_schizo_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_schizo_base_select())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* setup our cmd line */
|
||||
orte_cmd_line = OBJ_NEW(opal_cmd_line_t);
|
||||
if (OPAL_SUCCESS != (rc = opal_cmd_line_add(orte_cmd_line, cmd_line_init))) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* setup the rest of the cmd line only once */
|
||||
if (OPAL_SUCCESS != (rc = orte_schizo.define_cli(orte_cmd_line))) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* now that options have been defined, finish setup */
|
||||
mca_base_cmd_line_setup(orte_cmd_line);
|
||||
|
||||
@ -684,17 +343,16 @@ int prun(int argc, char *argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* tell the ess/tool component that we want to connect only to a system-level
|
||||
* PMIx server */
|
||||
if (orte_cmd_options.system_server_only) {
|
||||
/* tell the ess/tool component how we want to connect */
|
||||
if (myoptions.system_server_only) {
|
||||
opal_setenv(OPAL_MCA_PREFIX"ess_tool_system_server_only", "1", true, &environ);
|
||||
}
|
||||
if (orte_cmd_options.system_server_first) {
|
||||
if (myoptions.system_server_first) {
|
||||
opal_setenv(OPAL_MCA_PREFIX"ess_tool_system_server_first", "1", true, &environ);
|
||||
}
|
||||
/* if they specified the DVM's pid, then pass it along */
|
||||
if (0 != orte_cmd_options.pid) {
|
||||
asprintf(&param, "%d", orte_cmd_options.pid);
|
||||
if (0 != myoptions.pid) {
|
||||
asprintf(&param, "%d", myoptions.pid);
|
||||
opal_setenv(OPAL_MCA_PREFIX"ess_tool_server_pid", param, true, &environ);
|
||||
free(param);
|
||||
}
|
||||
@ -706,7 +364,7 @@ int prun(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* if the user just wants us to terminate a DVM, then do so */
|
||||
if (orte_cmd_options.terminate_dvm) {
|
||||
if (myoptions.terminate_dvm) {
|
||||
OBJ_CONSTRUCT(&info, opal_list_t);
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_JOB_CTRL_TERMINATE);
|
||||
@ -757,6 +415,200 @@ int prun(int argc, char *argv[])
|
||||
OPAL_PMIX_DESTRUCT_LOCK(&lock);
|
||||
OPAL_LIST_DESTRUCT(&info);
|
||||
|
||||
/* we want to be notified upon job completion */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_NOTIFY_COMPLETION);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
|
||||
/* see if they specified the personality */
|
||||
if (NULL != orte_cmd_options.personality) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_PERSONALITY);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup(orte_cmd_options.personality);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* check for stdout/err directives */
|
||||
/* if we were asked to tag output, mark it so */
|
||||
if (orte_cmd_options.tag_output) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_TAG_OUTPUT);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
/* if we were asked to timestamp output, mark it so */
|
||||
if (orte_cmd_options.timestamp_output) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_TIMESTAMP_OUTPUT);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
/* if we were asked to output to files, pass it along */
|
||||
if (NULL != orte_cmd_options.output_filename) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_OUTPUT_TO_FILE);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup(orte_cmd_options.output_filename);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
/* if we were asked to merge stderr to stdout, mark it so */
|
||||
if (orte_cmd_options.merge) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_MERGE_STDERR_STDOUT);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* check what user wants us to do with stdin */
|
||||
if (NULL != orte_cmd_options.stdin_target) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_STDIN_TGT);
|
||||
val->type = OPAL_UINT32;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
if (0 == strcmp(orte_cmd_options.stdin_target, "all")) {
|
||||
val->data.uint32 = ORTE_VPID_WILDCARD;
|
||||
} else if (0 == strcmp(orte_cmd_options.stdin_target, "none")) {
|
||||
val->data.uint32 = ORTE_VPID_INVALID;
|
||||
} else {
|
||||
val->data.uint32 = strtoul(orte_cmd_options.stdin_target, NULL, 10);
|
||||
}
|
||||
}
|
||||
|
||||
/* if we want the argv's indexed, indicate that */
|
||||
if (orte_cmd_options.index_argv) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_INDEX_ARGV);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
if (NULL != orte_cmd_options.mapping_policy) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_MAPBY);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup(orte_cmd_options.mapping_policy);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
} else if (orte_cmd_options.pernode) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_PPR);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup("1:node");
|
||||
opal_list_append(&job_info, &val->super);
|
||||
} else if (0 < orte_cmd_options.npernode) {
|
||||
/* define the ppr */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_PPR);
|
||||
val->type = OPAL_STRING;
|
||||
(void)asprintf(&val->data.string, "%d:node", orte_cmd_options.npernode);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
} else if (0 < orte_cmd_options.npersocket) {
|
||||
/* define the ppr */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_PPR);
|
||||
val->type = OPAL_STRING;
|
||||
(void)asprintf(&val->data.string, "%d:socket", orte_cmd_options.npernode);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* if the user specified cpus/rank, set it */
|
||||
if (0 < orte_cmd_options.cpus_per_proc) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_CPUS_PER_PROC);
|
||||
val->type = OPAL_UINT32;
|
||||
val->data.uint32 = orte_cmd_options.cpus_per_proc;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* if the user specified a ranking policy, then set it */
|
||||
if (NULL != orte_cmd_options.ranking_policy) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_RANKBY);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup(orte_cmd_options.ranking_policy);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* if the user specified a binding policy, then set it */
|
||||
if (NULL != orte_cmd_options.binding_policy) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_BINDTO);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup(orte_cmd_options.binding_policy);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* if they asked for nolocal, mark it so */
|
||||
if (orte_cmd_options.nolocal) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_NO_PROCS_ON_HEAD);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
if (orte_cmd_options.no_oversubscribe) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_NO_OVERSUBSCRIBE);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
if (orte_cmd_options.oversubscribe) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_NO_OVERSUBSCRIBE);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = false;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
if (orte_cmd_options.report_bindings) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_REPORT_BINDINGS);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
if (NULL != orte_cmd_options.cpu_list) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_CPU_LIST);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = strdup(orte_cmd_options.cpu_list);
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
/* mark if recovery was enabled on the cmd line */
|
||||
if (orte_enable_recovery) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_JOB_RECOVERABLE);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
/* record the max restarts */
|
||||
if (0 < orte_max_restarts) {
|
||||
OPAL_LIST_FOREACH(app, &apps, opal_pmix_app_t) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_MAX_RESTARTS);
|
||||
val->type = OPAL_UINT32;
|
||||
val->data.uint32 = orte_max_restarts;
|
||||
opal_list_append(&app->info, &val->super);
|
||||
}
|
||||
}
|
||||
/* if continuous operation was specified */
|
||||
if (orte_cmd_options.continuous) {
|
||||
/* mark this job as continuously operating */
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_JOB_CONTINUOUS);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.spawn(&job_info, &apps, &jobid))) {
|
||||
opal_output(0, "Job failed to spawn: %s", opal_strerror(rc));
|
||||
goto DONE;
|
||||
@ -947,7 +799,7 @@ static int create_app(int argc, char* argv[],
|
||||
val->key = strdup(OPAL_PMIX_SET_SESSION_CWD);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
} else {
|
||||
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:init-failure",
|
||||
@ -973,7 +825,7 @@ static int create_app(int argc, char* argv[],
|
||||
val->key = strdup(OPAL_PMIX_HOSTFILE);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = value;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
@ -988,7 +840,7 @@ static int create_app(int argc, char* argv[],
|
||||
val->key = strdup(OPAL_PMIX_HOSTFILE);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = value;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1004,7 +856,7 @@ static int create_app(int argc, char* argv[],
|
||||
val->key = strdup(OPAL_PMIX_HOST);
|
||||
val->type = OPAL_STRING;
|
||||
val->data.string = tval;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
}
|
||||
|
||||
/* check for bozo error */
|
||||
@ -1028,12 +880,12 @@ static int create_app(int argc, char* argv[],
|
||||
val->key = strdup(OPAL_PMIX_SET_SESSION_CWD);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_PRELOAD_BIN);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
}
|
||||
}
|
||||
if (NULL != orte_cmd_options.preload_files) {
|
||||
@ -1041,7 +893,7 @@ static int create_app(int argc, char* argv[],
|
||||
val->key = strdup(OPAL_PMIX_PRELOAD_FILES);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&job_info, &val->super);
|
||||
opal_list_append(&app->info, &val->super);
|
||||
}
|
||||
|
||||
/* Do not try to find argv[0] here -- the starter is responsible
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user