1
1
openmpi/orte/tools/orte-submit/orte-submit.c

183 строки
6.0 KiB
C

/* -*- C -*-
*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/orted/orted_submit.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
/*
* Globals
*/
/*
 * Bookkeeping shared between main() and one of its job callbacks
 * (launched() or completed()); the callback receives it through cbdata.
 */
typedef struct {
/* result code ("ret") reported to the callback */
int status;
/* set to true by main() before the job is submitted and cleared by the
 * callback; main() keeps running the event loop while it is true */
volatile bool active;
/* job object handed to the callback, which takes a reference on it
 * with OBJ_RETAIN before storing it here */
orte_job_t *jdata;
} orte_submit_status_t;
/* Callback given to orte_submit_job() in main() together with &launchst as
 * its cbdata; records the launch result. */
static void launched(int index, orte_job_t *jdata, int ret, void *cbdata);
/* Callback given to orte_submit_job() in main() together with &completest as
 * its cbdata; records the completion result. */
static void completed(int index, orte_job_t *jdata, int ret, void *cbdata);
/*
 * Command-line options specific to this tool; main() hands this table to
 * opal_cmd_line_create(). The table ends with an all-NULL entry of type
 * OPAL_CMD_LINE_TYPE_NULL.
 *
 * NOTE(review): each entry presumably follows the field order of
 * opal_cmd_line_init_t (MCA parameter name, short option character,
 * single-dash name, long name, number of parameters, destination variable,
 * value type, help text) - confirm against opal/util/cmd_line.h.
 */
static opal_cmd_line_init_t cmd_line_init[] = {
/* -q / --quiet */
{ "orte_execute_quiet", 'q', NULL, "quiet", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Suppress helpful messages" },
/* where to report the pid; the value is stored in orte_cmd_line.report_pid */
{ NULL, '\0', "report-pid", "report-pid", 1,
&orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING,
"Printout pid on stdout [-], stderr [+], or a file [anything else]" },
/* where to report the URI; the value is stored in orte_cmd_line.report_uri */
{ NULL, '\0', "report-uri", "report-uri", 1,
&orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },
/* exit status reporting
 * NOTE(review): the help text ("primary job only") reads oddly next to the
 * option name ("report child jobs separately") - confirm the wording. */
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Return the exit status of the primary job only" },
/* select XML output */
{ "orte_xml_output", '\0', "xml", "xml", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Provide all output in XML format" },
/* XML output redirected to a file */
{ "orte_xml_file", '\0', "xml-file", "xml-file", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Provide all output in XML format to the specified file" },
/* show the output of selected ranks in xterm windows */
{ "orte_xterm", '\0', "xterm", "xterm", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Create a new xterm window and display output from the specified ranks there" },
/* tell the dvm to terminate; sets orte_cmd_line.terminate_dvm, which
 * main() tests after initialization */
{ NULL, '\0', "terminate", "terminate", 0,
&orte_cmd_line.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
"Terminate the DVM" },
/* End of list */
{ NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
};
int main(int argc, char *argv[])
{
int rc;
orte_submit_status_t launchst, completest;
opal_cmd_line_t cmd_line;
orte_cmd_line.terminate_dvm = NULL;
/* setup our cmd line */
opal_cmd_line_create(&cmd_line, cmd_line_init);
mca_base_cmd_line_setup(&cmd_line);
/* initialize the RTE */
if (ORTE_SUCCESS != (rc = orte_submit_init(argc, argv, &cmd_line))) {
fprintf(stderr, "Init failed due to duplicate command options\n");
exit(rc);
}
/* if this is the terminate command, just send it */
if (orte_cmd_line.terminate_dvm) {
rc = orte_submit_halt();
/* just loop the event library - the errmgr
* will exit us when the connection to our
* HNP closes */
while (1) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
}
/* launch whatever job we were given */
memset(&launchst, 0, sizeof(launchst));
memset(&completest, 0, sizeof(completest));
launchst.active = true;
completest.active = true;
if (ORTE_SUCCESS != (rc = orte_submit_job(argv, NULL,
launched, &launchst,
completed, &completest))) {
if (ORTE_ERR_OP_IN_PROGRESS == rc) {
/* terminate command was given */
goto waiting;
}
opal_output(0, "JOB FAILED TO LAUNCH WITH ERROR %d:%s",
rc, ORTE_ERROR_NAME(rc));
goto DONE;
}
// wait for response and unpack the status, jobid
while (launchst.active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
if (orte_debug_flag) {
opal_output(0, "Job %s has launched", ORTE_JOBID_PRINT(launchst.jdata->jobid));
}
if (ORTE_SUCCESS != launchst.status) {
goto DONE;
}
waiting:
while (completest.active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
DONE:
/* cleanup and leave */
orte_submit_finalize();
if (orte_debug_flag) {
fprintf(stderr, "exiting with status %d\n", orte_exit_status);
}
exit(orte_exit_status);
}
/*
 * Launch callback registered with orte_submit_job().
 *
 * index   unused here
 * jdata   job object; a reference is taken and the pointer is stored
 * ret     launch result; stored and folded into the tool's exit status
 * cbdata  the orte_submit_status_t tracker supplied at submit time
 */
static void launched(int index, orte_job_t *jdata, int ret, void *cbdata)
{
    orte_submit_status_t *tracker = cbdata;

    /* hold on to the job object on behalf of main() */
    OBJ_RETAIN(jdata);
    tracker->jdata = jdata;

    /* record the outcome */
    tracker->status = ret;
    ORTE_UPDATE_EXIT_STATUS(ret);

    /* cleared last: main() stops polling as soon as this is false */
    tracker->active = false;
}
/*
 * Completion callback registered with orte_submit_job().
 *
 * index   unused here
 * jdata   job object; a reference is taken and the pointer is stored
 * ret     completion result; stored and folded into the tool's exit status
 * cbdata  the orte_submit_status_t tracker supplied at submit time
 */
static void completed(int index, orte_job_t *jdata, int ret, void *cbdata)
{
    orte_submit_status_t *tracker = cbdata;

    /* hold on to the job object on behalf of main() */
    OBJ_RETAIN(jdata);
    tracker->jdata = jdata;

    /* record the outcome */
    tracker->status = ret;
    ORTE_UPDATE_EXIT_STATUS(ret);

    /* cleared last: main() stops polling as soon as this is false */
    tracker->active = false;
}