Complete the modifications for handling applications that fail to start. Modifications for failed-to-start orteds are coming next.

This completes the minor changes required to the PLS components. Basically, a small change is required to the parameter list of the orted cmd functions. I caught and made it for xcpu and poe, in addition to the components listed in my email - so I believe that only leaves xgrid unconverted. The orted fail-to-start mods will also touch the PLS components, but those changes can be localized, so they will come in one at a time.

This commit was SVN r14499.
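For reference, the parameter-list change at the heart of the diff below is that the PLS terminate_orteds entry point loses its orte_jobid_t argument; the jobid is likewise dropped from the proxy component's wire protocol (the orte_dss.pack of ORTE_JOBID is deleted) and from the base receive function. A minimal before/after sketch of the module typedef, taken directly from the pls.h hunk:

    /* before: callers named the job whose daemons should exit */
    typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(
        orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);

    /* after: the command simply applies to the active orteds */
    typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(
        struct timeval *timeout, opal_list_t *attrs);

The other recurring pattern is the failed-to-start bookkeeping each launcher gains: a flag that is cleared only after a clean launch, and a cleanup path that marks the job ORTE_JOB_STATE_FAILED_TO_START and wakes orterun. A condensed sketch of that idiom follows (launch_job_sketch is a made-up name; the real functions are the per-component *_launch_job routines, whose launch work is elided here):

    static int launch_job_sketch(orte_jobid_t jobid)
    {
        int rc = ORTE_SUCCESS;
        bool failed_launch = true;   /* assume failure until proven otherwise */

        /* ... do the launch work; on any error, goto cleanup ... */

        failed_launch = false;       /* reached only on a clean launch */

    cleanup:
        if (failed_launch) {
            /* record the failure and wake orterun so the job can terminate */
            if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
                ORTE_ERROR_LOG(rc);
            }
            if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    }

This is also why the launchers now initialize their pointers to NULL and funnel every error through goto cleanup: the cleanup block must be safe to reach from any point in the function.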
This commit is contained in:
Parent: a764aa6395
Commit: 18cb5c9762
@@ -169,13 +169,6 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
            break;

        case ORTE_PLS_TERMINATE_ORTEDS_CMD:
-           /* get the jobid whose daemons are to be terminated */
-           count = 1;
-           if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
-               ORTE_ERROR_LOG(rc);
-               goto SEND_ANSWER;
-           }
-
            /* get any attributes */
            OBJ_CONSTRUCT(&attrs, opal_list_t);
            count = 1;
@@ -199,7 +192,7 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
            timeout.tv_usec = microsecs;

            /* issue the command */
-           if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &timeout, &attrs))) {
+           if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(&timeout, &attrs))) {
                ORTE_ERROR_LOG(rc);
            }
@@ -46,7 +46,6 @@

#include "opal/mca/installdirs/installdirs.h"
#include "opal/class/opal_list.h"
-#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
@@ -72,6 +71,7 @@
#include "orte/mca/schema/schema_types.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/orte_wait.h"
+#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"

@@ -438,6 +438,10 @@ static void orte_pls_bproc_setup_env(char *** env)
 * @retval ORTE_SUCCESS
 * @retval error
 */
+/* When working in this function, ALWAYS jump to "cleanup" if
+ * you encounter an error so that orterun will be woken up and
+ * the job can cleanly terminate
+ */
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
    int * daemon_list = NULL;
    int num_daemons = 0;
@@ -452,9 +456,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
    orte_vpid_t daemon_vpid_start;
    orte_std_cntr_t idx;
    struct stat buf;
-   opal_list_t daemons;
-   orte_pls_daemon_info_t *dmn;
-   opal_list_item_t *item;
    struct timeval joblaunchstart, launchstart, launchstop;

    OPAL_TRACE(1);
@@ -468,11 +469,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
    /* indicate that the daemons have not completely launched yet */
    daemons_launched = false;

-   /* setup a list that will contain the info for all the daemons
-    * so we can store it on the registry when done
-    */
-   OBJ_CONSTRUCT(&daemons, opal_list_t);
-
    /* get the number of nodes in this job and allocate an array for
     * their names so we can pass that to bproc - populate the list
     * with the node names
@@ -480,12 +476,12 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
    num_daemons = map->num_nodes;
    if (0 == num_daemons) {
        /* nothing to do */
-       OBJ_DESTRUCT(&daemons);
        return ORTE_SUCCESS;
    }

    if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+       rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    i = 0;
@@ -500,6 +496,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
    /* allocate storage for bproc to return the daemon pids */
    if(NULL == (pids = (int*)malloc(sizeof(int) * num_daemons))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+       rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

@@ -632,6 +629,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
                       rc, *pids);
    }

+   /* we need to be smarter here - right now, we stop on the first negative pid. But
+    * daemons beyond that one might have started. This could leave a daemon stranded
+    * when we abort
+    */
    for(i = 0; i < num_daemons; i++) {
        if(0 >= pids[i]) {
            opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true,
@@ -650,28 +651,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
-
-       dmn = OBJ_NEW(orte_pls_daemon_info_t);
-       rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0,
-                                        daemon_vpid_start + i);
-       if(ORTE_SUCCESS != rc) {
-           ORTE_ERROR_LOG(rc);
-           goto cleanup;
-       }
-       dmn->cell = dmn->name->cellid;
-       dmn->nodename = strdup(param);
-       dmn->active_job = map->job;
-       opal_list_append(&daemons, &dmn->super);

            free(param);
        }
    }

-   /* store the daemon info */
-   if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
-       ORTE_ERROR_LOG(rc);
-   }
-
    /* setup the callbacks - this needs to be done *after* we store the
     * daemon info so that short-lived apps don't cause mpirun to
     * try and terminate the orteds before we record them
@@ -718,7 +701,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
        }
        rc = ORTE_ERROR;
        ORTE_ERROR_LOG(rc);
-       orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL);
        goto cleanup;
    }
}
@@ -747,10 +729,17 @@ cleanup:
    if(NULL != orted_path) {
        free(orted_path);
    }
-   while (NULL != (item = opal_list_remove_first(&daemons))) {
-       OBJ_RELEASE(item);
-   }
-   OBJ_DESTRUCT(&daemons);

+   /* check for failed launch - if so, force terminate */
+   if (!daemons_launched) {
+       if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(map->job, ORTE_JOB_STATE_FAILED_TO_START))) {
+           ORTE_ERROR_LOG(rc);
+       }
+
+       if (ORTE_SUCCESS != (rc = orte_wakeup(map->job))) {
+           ORTE_ERROR_LOG(rc);
+       }
+   }

    return rc;
}
@@ -784,7 +773,7 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
    orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL);

    /* kill the daemons */
-   orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL);
+   orte_pls_bproc_terminate_orteds(&orte_abort_timeout, NULL);

    /* shouldn't ever get here.. */
    exit(1);
@@ -806,9 +795,16 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
 * @retval ORTE_SUCCESS
 * @retval error
 */
+/* When working in this function, ALWAYS jump to "cleanup" if
+ * you encounter an error so that orterun will be woken up and
+ * the job can cleanly terminate. Since we don't use the ORTE
+ * daemons to launch the application procs, this is the *only*
+ * way we have of knowing something went wrong.
+ */
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
                                     orte_vpid_t vpid_start, int app_context) {
-   int *node_array, num_nodes, cycle;
+   int *node_array=NULL, num_nodes, cycle;
    int rc, i, j, stride;
    orte_std_cntr_t num_processes;
    int *pids = NULL;
@@ -817,6 +813,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
    struct bproc_io_t bproc_io[3];
    char **env;
    int dbg;
+   bool apps_launched = false;

    OPAL_TRACE(1);

@@ -862,7 +859,8 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
    node_array = (int*)malloc(map->num_nodes * sizeof(int));
    if (NULL == node_array) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
-       return ORTE_ERR_OUT_OF_RESOURCE;
+       rc = ORTE_ERR_OUT_OF_RESOURCE;
+       goto cleanup;
    }

    /* initialize the cycle count. Computing the process name under Bproc
@@ -949,6 +947,10 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
        goto cleanup;
    }

+   /* we need to be smarter here - right now, we stop on the first negative pid. But
+    * processes beyond that one might have started. This leaves those procs stranded
+    * when we abort
+    */
    for(j = 0; j < num_nodes; j++) {
        if(0 >= pids[j]) {
            opal_show_help("help-pls-bproc.txt", "proc-launch-bad-pid", true,
@@ -1007,15 +1009,34 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
            goto cleanup;
        }
    }

+   /* get here if the app procs launched cleanly */
+   apps_launched = true;

cleanup:
    if(NULL != pids) {
        free(pids);
    }

-   free(node_array);
+   if (NULL != node_array) {
+       free(node_array);
+   }
+
+   if (NULL != env) {
+       opal_argv_free(env);
+   }
+
+   /* check for failed launch - if so, force terminate */
+   if (!apps_launched) {
+       if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(map->job, ORTE_JOB_STATE_FAILED_TO_START))) {
+           ORTE_ERROR_LOG(rc);
+       }
+
+       if (ORTE_SUCCESS != (rc = orte_wakeup(map->job))) {
+           ORTE_ERROR_LOG(rc);
+       }
+   }

-   if (NULL != env) opal_argv_free(env);
    return rc;
}

@@ -1032,8 +1053,13 @@ cleanup:
 * @retval ORTE_SUCCESS
 * @retval error
 */
+/* When working in this function, ALWAYS jump to "cleanup" if
+ * you encounter an error so that orterun will be woken up and
+ * the job can cleanly terminate
+ */
int orte_pls_bproc_launch(orte_jobid_t jobid) {
-   orte_job_map_t* map;
+   orte_job_map_t* map = NULL;
    orte_mapped_node_t *map_node;
    orte_vpid_t vpid_launch;
    int rc;
@@ -1043,26 +1069,31 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
    char cwd_save[OMPI_PATH_MAX + 1];
    orte_ras_node_t *ras_node;
    char **daemon_env;
+   bool launched;

    OPAL_TRACE(1);

+   /* indicate the launch condition */
+   launched = false;
+
    /* make sure the pls_bproc receive function has been started */
    if (ORTE_SUCCESS != (rc = orte_pls_bproc_comm_start())) {
        ORTE_ERROR_LOG(rc);
-       return rc;
+       goto cleanup;
    }

    /* save the current working directory */
    if (NULL == getcwd(cwd_save, sizeof(cwd_save))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
-       return ORTE_ERR_NOT_FOUND;
+       rc = ORTE_ERR_NOT_FOUND;
+       goto cleanup;
    }
    cwd_save[sizeof(cwd_save) - 1] = '\0';

    /* get the job map */
    if(ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, jobid))) {
        ORTE_ERROR_LOG(rc);
-       return rc;
+       goto cleanup;
    }

    /* set the mapping mode */
@@ -1158,16 +1189,32 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
        vpid_launch += map->apps[context]->num_procs;
    }

+   /* indicate a successful launch */
+   launched = true;
+
cleanup:
    chdir(cwd_save);

-   OBJ_RELEASE(map);
+   if (NULL != map) {
+       OBJ_RELEASE(map);
+   }

    if (mca_pls_bproc_component.do_not_launch) {
        /* indicate that we failed to launch, but do so silently */
        return ORTE_ERR_SILENT;
    }

+   /* check for failed launch - if so, force terminate */
+   if (!launched) {
+       if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
+           ORTE_ERROR_LOG(rc);
+       }
+
+       if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
+           ORTE_ERROR_LOG(rc);
+       }
+   }

    return rc;
}

@@ -1203,17 +1250,15 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, op
/**
 * Terminate the orteds for a given job
 */
-int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+int orte_pls_bproc_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    int rc;

    OPAL_TRACE(1);

-   /* now tell them to die! */
+   /* tell them to die! */
    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}
@@ -43,7 +43,7 @@

static int orte_pls_cnos_launch_job(orte_jobid_t jobid);
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
-static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
+static int orte_pls_cnos_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name);
static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal);
@@ -91,18 +91,13 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeo
}


-static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+static int orte_pls_cnos_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
-   orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid;
-
-   /* make sure it's my job */
-   if (jobid == my_jobid) {
#ifdef HAVE_KILLRANK
-       killrank(-1, SIGKILL);
+   killrank(-1, SIGKILL);
#else
-       exit(0);
+   exit(0);
#endif
-   }

    return ORTE_ERR_NOT_SUPPORTED;
}
@@ -207,7 +207,7 @@ typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, struct time
/**
 * Terminate the daemons associated with this jobid
 */
-typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
+typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(struct timeval *timeout, opal_list_t *attrs);

/**
 * Terminate a specific process.
@@ -64,7 +64,7 @@ extern char **environ;
 */
static int pls_poe_launch_job(orte_jobid_t jobid);
static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
-static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
+static int pls_poe_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_proc(const orte_process_name_t *name);
static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal);
@@ -477,7 +477,8 @@ static inline int poe_launch_interactive_job(orte_jobid_t jobid)
        fclose(hfp);
    }

-   rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
+   vpid_start = 0;
+   rc = orte_ns.get_vpid_range(jobid, &vpid_range);
    if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }

    /* Create a temporary POE command file */
@@ -589,7 +590,7 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name)
    return ORTE_ERR_NOT_IMPLEMENTED;
}

-static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+static int pls_poe_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    return ORTE_ERR_NOT_IMPLEMENTED;
}
@@ -201,7 +201,7 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal
    return ORTE_SUCCESS;
}

-int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs)
+int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
@@ -226,12 +226,6 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, o
        return rc;
    }

-   if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
-       ORTE_ERROR_LOG(rc);
-       OBJ_RELEASE(cmd);
-       return rc;
-   }
-
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);

@@ -53,7 +53,7 @@ int orte_pls_proxy_finalize(void);
 */
int orte_pls_proxy_launch(orte_jobid_t job);
int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
-int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
+int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_proc(const orte_process_name_t* name);
int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs);
int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal);
@@ -55,7 +55,7 @@ int orte_pls_rsh_finalize(void);
 */
int orte_pls_rsh_launch(orte_jobid_t);
int orte_pls_rsh_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
-int orte_pls_rsh_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
+int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*);
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);

@@ -1071,7 +1071,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal
/**
 * Terminate the orteds for a given job
 */
-int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    int rc;
@@ -75,7 +75,7 @@
 */
static int pls_slurm_launch_job(orte_jobid_t jobid);
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
-static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
+static int pls_slurm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
@@ -101,9 +101,10 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = {
};

/*
- * Local variable
+ * Local variables
 */
static pid_t srun_pid = 0;
+static orte_jobid_t active_job = ORTE_JOBID_INVALID;


/*
@@ -113,16 +114,19 @@ static pid_t srun_pid = 0;
extern char **environ;
#endif /* !defined(__WINDOWS__) */

+/* When working in this function, ALWAYS jump to "cleanup" if
+ * you encounter an error so that orterun will be woken up and
+ * the job can cleanly terminate
+ */
static int pls_slurm_launch_job(orte_jobid_t jobid)
{
-   orte_job_map_t *map;
+   orte_job_map_t *map = NULL;
    opal_list_item_t *item;
    size_t num_nodes;
    orte_vpid_t vpid;
    orte_vpid_t start_vpid;
    char *jobid_string = NULL;
    char *param;
-   char **argv;
+   char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
@@ -136,10 +140,9 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
-   opal_list_t daemons;
-   orte_pls_daemon_info_t *dmn;
    struct timeval joblaunchstart, launchstart, launchstop;
    int proc_name_index = 0;
+   bool failed_launch = true;

    if (mca_pls_slurm_component.timing) {
        if (0 != gettimeofday(&joblaunchstart, NULL)) {
@@ -147,10 +150,8 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
        }
    }

-   /* setup a list that will contain the info for all the daemons
-    * so we can store it on the registry when done
-    */
-   OBJ_CONSTRUCT(&daemons, opal_list_t);
+   /* save the active jobid */
+   active_job = jobid;

    /* Query the map for this job.
     * We need the entire mapping for a couple of reasons:
@@ -161,8 +162,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
-       OBJ_DESTRUCT(&daemons);
-       return rc;
+       goto cleanup;
    }

    /* if the user requested that we re-use daemons,
@@ -171,9 +171,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
            ORTE_ERROR_LOG(rc);
-           OBJ_RELEASE(map);
-           OBJ_DESTRUCT(&daemons);
-           return rc;
+           goto cleanup;
        }
    }

@@ -186,14 +184,13 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
         * on existing daemons, so we can just return
         */
        OBJ_RELEASE(map);
-       OBJ_DESTRUCT(&daemons);
        return ORTE_SUCCESS;
    }
    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    start_vpid = vpid;

    /* setup the orted triggers for passing their launch info */
    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
@@ -332,31 +329,6 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
        }
    }

-   /* setup the daemon info for each node */
-   vpid = start_vpid;
-   for (item = opal_list_get_first(&map->nodes);
-        item != opal_list_get_end(&map->nodes);
-        item = opal_list_get_next(item)) {
-       orte_mapped_node_t* node = (orte_mapped_node_t*)item;
-
-       /* record the daemons info for this node */
-       dmn = OBJ_NEW(orte_pls_daemon_info_t);
-       dmn->active_job = jobid;
-       dmn->cell = node->cell;
-       dmn->nodename = strdup(node->nodename);
-       if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(dmn->name), dmn->cell, 0, vpid))) {
-           ORTE_ERROR_LOG(rc);
-           goto cleanup;
-       }
-       opal_list_append(&daemons, &dmn->super);
-       vpid++;
-   }
-
-   /* store the daemon info on the registry */
-   if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
-       ORTE_ERROR_LOG(rc);
-   }

    /* setup environment */
    env = opal_argv_copy(environ);
    var = mca_base_param_environ_variable("seed", NULL, NULL);
@@ -374,7 +346,19 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
    }

    /* exec the daemon */
-   rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);
+   if (ORTE_SUCCESS != (rc = pls_slurm_start_proc(argc, argv, env, cur_prefix))) {
+       ORTE_ERROR_LOG(rc);
+       goto cleanup;
+   }
+
+   /* do NOT wait for srun to complete. Srun only completes when the processes
+    * it starts - in this case, the orteds - complete. We need to go ahead and
+    * return so orterun can do the rest of its stuff. Instead, we'll catch
+    * any srun failures and deal with them elsewhere
+    */
+
+   /* declare the launch a success */
+   failed_launch = false;

    if (mca_pls_slurm_component.timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
@@ -395,21 +379,32 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
    }

    /* JMS: should we stash the srun pid in the gpr somewhere for cleanup? */
    /* JMS: how do we catch when srun dies? */

cleanup:
-   OBJ_RELEASE(map);
-   opal_argv_free(argv);
-   opal_argv_free(env);
+   if (NULL != map) {
+       OBJ_RELEASE(map);
+   }
+   if (NULL != argv) {
+       opal_argv_free(argv);
+   }
+   if (NULL != env) {
+       opal_argv_free(env);
+   }

    if(NULL != jobid_string) {
        free(jobid_string);
    }

-   while (NULL != (item = opal_list_remove_first(&daemons))) {
-       OBJ_RELEASE(item);
-   }
-   OBJ_DESTRUCT(&daemons);
+   /* check for failed launch - if so, force terminate */
+   if (failed_launch) {
+       if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
+           ORTE_ERROR_LOG(rc);
+       }
+
+       if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
+           ORTE_ERROR_LOG(rc);
+       }
+   }

    return rc;
}
@@ -431,11 +426,18 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout,
/**
 * Terminate the orteds for a given job
 */
-static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+static int pls_slurm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    int rc;

-   /* order them to go away */
+   /* deregister the waitpid callback to ensure we don't make it look like
+    * srun failed when it didn't. Since the srun may have already completed,
+    * do NOT ERROR_LOG any return code to avoid confusing, duplicate error
+    * messages
+    */
+   orte_wait_cb_cancel(srun_pid);
+
+   /* tell them to die! */
    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
        ORTE_ERROR_LOG(rc);
    }
@@ -495,7 +497,7 @@ static int pls_slurm_cancel_operation(void)
static int pls_slurm_finalize(void)
{
    int rc;

    /* cleanup any pending recvs */
    if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
        ORTE_ERROR_LOG(rc);
@@ -505,6 +507,46 @@ static int pls_slurm_finalize(void)
}


+static void srun_wait_cb(pid_t pid, int status, void* cbdata){
+   /* According to the SLURM folks, srun always returns the highest exit
+      code of our remote processes. Thus, a non-zero exit status doesn't
+      necessarily mean that srun failed - it could be that an orted returned
+      a non-zero exit status. Of course, that means the orted failed(!), so
+      the end result is the same - the job didn't start.
+
+      As a result, we really can't do much with the exit status itself - it
+      could be something in errno (if srun itself failed), or it could be
+      something returned by an orted, or it could be something returned by
+      the OS (e.g., couldn't find the orted binary). Somebody is welcome
+      to sort out all the options and pretty-print a better error message. For
+      now, though, the only thing that really matters is that
+      srun failed. Report the error and make sure that orterun
+      wakes up - otherwise, do nothing!
+   */
+   int rc;
+
+   if (0 != status) {
+       /* we have a problem */
+       opal_output(0, "ERROR: srun failed to start the required daemons.");
+       opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
+       opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
+       opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
+
+       /* set the job state so we know it failed to start */
+       if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
+           ORTE_ERROR_LOG(rc);
+       }
+
+       /* force termination of the job */
+       if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
+           ORTE_ERROR_LOG(rc);
+       }
+   }
+}


static int pls_slurm_start_proc(int argc, char **argv, char **env,
                                char *prefix)
{
@@ -517,9 +559,11 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,

    srun_pid = fork();
    if (-1 == srun_pid) {
-       opal_output(0, "pls:slurm:start_proc: fork failed");
-       return ORTE_ERR_IN_ERRNO;
-   } else if (0 == srun_pid) {
+       ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
+       return ORTE_ERR_SYS_LIMITS_CHILDREN;
+   }
+
+   if (0 == srun_pid) {  /* child */
        char *bin_base = NULL, *lib_base = NULL;

        /* Figure out the basenames for the libdir and bindir. There
@@ -596,14 +640,16 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
        /* don't return - need to exit - returning would be bad -
           we're not in the calling process anymore */
        exit(1);
+   } else {  /* parent */
+       /* just in case, make sure that the srun process is not in our
+          process group any more. Stevens says always do this on both
+          sides of the fork... */
+       setpgid(srun_pid, srun_pid);
+
+       /* setup the waitpid so we can find out if srun succeeds! */
+       orte_wait_cb(srun_pid, srun_wait_cb, NULL);
+       free(exec_argv);
    }

-   free(exec_argv);
-
-   /* just in case, make sure that the srun process is not in our
-      process group any more. Stevens says always do this on both
-      sides of the fork... */
-   setpgid(srun_pid, srun_pid);

    return ORTE_SUCCESS;
}
@@ -25,20 +25,17 @@ The first two prefix values supplied for node %s were:
%s
and %s
#
-[daemon-not-found]
-The TM (PBS / Torque) process starter in Open MPI was unable to find
-its daemon executable (orted) on the node where mpirun was executed.
+[tm-spawn-failed]
+The TM (PBS / Torque) process starter failed to spawn a daemon (orted)
+on a remote node.

-This sanity check is performed because the back-end PBS / Torque
-process launcher does not provide any kind of error to Open MPI if it
-tries to launch its daemon on a remote node, but the daemon cannot be
-found. Open MPI's check for the daemon locally is somewhat of a lame
-workaround / sanity check.
+Command line: %s
+Node name: %s
+Launch id: %d

If you do not understand this error message, please try the following:

-1. Try to add the Open MPI executables to your PATH
-2. Use the --prefix option to mpirun to indicate where Open MPI can
-   find its executables
-3. Set the MCA parameter "pls_tm_want_path_check" to 0
-4. Talk to your local system administration
+1. Ensure that the executable "orted" is in your PATH
+2. Use the --prefix option to indicate where we can
+   find that executable
+3. Talk to your local system administrator
@@ -60,6 +60,7 @@
#include "orte/orte_types.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
+#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/smr/smr.h"
@@ -80,7 +81,7 @@
 */
static int pls_tm_launch_job(orte_jobid_t jobid);
static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
-static int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
+static int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_proc(const orte_process_name_t *name);
static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal);
@@ -89,7 +90,6 @@ static int pls_tm_finalize(void);

static int pls_tm_connect(void);
static int pls_tm_disconnect(void);
-static int pls_tm_check_path(char *exe, char **env);

/*
 * Local variables
@@ -114,19 +114,23 @@ orte_pls_base_module_t orte_pls_tm_module = {
extern char **environ;
#endif /* !defined(__WINDOWS__) */

+/* When working in this function, ALWAYS jump to "cleanup" if
+ * you encounter an error so that orterun will be woken up and
+ * the job can cleanly terminate
+ */
static int pls_tm_launch_job(orte_jobid_t jobid)
{
-   orte_job_map_t *map;
+   orte_job_map_t *map = NULL;
    opal_list_item_t *item;
    size_t num_nodes;
    orte_vpid_t vpid;
    int node_name_index;
    int proc_name_index;
    char *jobid_string;
-   char *param;
-   char **env;
+   char *uri, *param;
+   char **env = NULL;
    char *var;
-   char **argv;
+   char **argv = NULL;
    int argc;
    int rc;
    bool connected = false;
@@ -136,12 +140,11 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
    tm_task_id *tm_task_ids = NULL;
    int local_err;
    tm_event_t event;
-   opal_list_t daemons;
-   orte_pls_daemon_info_t *dmn;
    struct timeval launchstart, launchstop, completionstart, completionstop;
    struct timeval jobstart, jobstop;
    int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
    float avgtime=0.0;
+   bool failed_launch = true;

    /* check for timing request - get start time if so */
    if (mca_pls_tm_component.timing) {
@@ -158,7 +161,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
-       return rc;
+       goto cleanup;
    }

    /* if the user requested that we re-use daemons,
@@ -167,8 +170,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
            ORTE_ERROR_LOG(rc);
-           OBJ_RELEASE(map);
-           return rc;
+           goto cleanup;
        }
    }

@@ -184,6 +186,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
     */
    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
+       goto cleanup;
    }

@@ -193,20 +196,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
        goto cleanup;
    }

-   /* setup a list that will contain the info for all the daemons
-    * so we can store it on the registry when done
-    */
-   OBJ_CONSTRUCT(&daemons, opal_list_t);
-
    /* Allocate a bunch of TM events to use for tm_spawn()ing */
    tm_events = malloc(sizeof(tm_event_t) * num_nodes);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    tm_task_ids = malloc(sizeof(tm_task_id) * num_nodes);
    if (NULL == tm_task_ids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

@@ -294,17 +294,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
        }
    }

-   /* Do a quick sanity check to ensure that we can find the
-      orted in the PATH */
-
-   if (ORTE_SUCCESS !=
-       (rc = pls_tm_check_path(argv[0], env))) {
-       ORTE_ERROR_LOG(rc);
-       opal_show_help("help-pls-tm.txt", "daemon-not-found",
-                      true, argv[0]);
-       goto cleanup;
-   }
-
    /* Iterate through each of the nodes and spin
     * up a daemon.
     */
@@ -315,19 +304,10 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
        orte_process_name_t* name;
        char* name_string;

-       /* new daemon - setup to record its info */
-       dmn = OBJ_NEW(orte_pls_daemon_info_t);
-       dmn->active_job = jobid;
-       opal_list_append(&daemons, &dmn->super);
-
        /* setup node name */
        free(argv[node_name_index]);
        argv[node_name_index] = strdup(node->nodename);

-       /* record the node name in the daemon struct */
-       dmn->cell = node->cell;
-       dmn->nodename = strdup(node->nodename);
-
        /* initialize daemons process name */
        rc = orte_ns.create_process_name(&name, node->cell, 0, vpid);
        if (ORTE_SUCCESS != rc) {
@@ -335,12 +315,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
            goto cleanup;
        }

-       /* save it in the daemon struct */
-       if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
-           ORTE_ERROR_LOG(rc);
-           goto cleanup;
-       }
-
        /* setup per-node options */
        if (mca_pls_tm_component.debug ||
            mca_pls_tm_component.verbose) {
@@ -352,7 +326,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
        rc = orte_ns.get_proc_name_string(&name_string, name);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "pls:tm: unable to create process name");
-           return rc;
+           goto cleanup;
        }
        free(argv[proc_name_index]);
        argv[proc_name_index] = strdup(name_string);
@@ -377,13 +351,12 @@ static int pls_tm_launch_job(orte_jobid_t jobid)

        rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
        if (TM_SUCCESS != rc) {
-           return ORTE_ERROR;
-       }
-
-       if (ORTE_SUCCESS != rc) {
-           opal_output(0, "pls:tm: start_procs returned error %d", rc);
+           opal_show_help("help-pls-tm.txt", "tm-spawn-failed",
+                          true, argv[0], node->nodename, node->launch_id);
            rc = ORTE_ERROR;
            goto cleanup;
        }

        /* check for timing request - get stop time and process if so */
        if (mca_pls_tm_component.timing) {
            if (0 != gettimeofday(&launchstop, NULL)) {
@@ -423,21 +396,19 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
            }
        }

-   /* all done, so store the daemon info on the registry */
-   if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
-       ORTE_ERROR_LOG(rc);
-   }

    /* TM poll for all the spawns */
    for (i = 0; i < launched; ++i) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            errno = local_err;
            opal_output(0, "pls:tm: failed to poll for a spawned proc, return status = %d", rc);
-           return ORTE_ERR_IN_ERRNO;
+           goto cleanup;
        }
    }

+   /* if we get here, then everything launched okay - record that fact */
+   failed_launch = false;
+
    /* check for timing request - get stop time for launch completion and report */
    if (mca_pls_tm_component.timing) {
        if (0 != gettimeofday(&completionstop, NULL)) {
@@ -455,7 +426,15 @@ static int pls_tm_launch_job(orte_jobid_t jobid)


cleanup:
-   OBJ_RELEASE(map);
+   if (NULL != map) {
+       OBJ_RELEASE(map);
+   }
+   if (NULL != argv) {
+       opal_argv_free(argv);
+   }
+   if (NULL != env) {
+       opal_argv_free(env);
+   }

    if (connected) {
        pls_tm_disconnect();
@@ -474,12 +453,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
        free(bin_base);
    }

-   /* deconstruct the daemon list */
-   while (NULL != (item = opal_list_remove_first(&daemons))) {
-       OBJ_RELEASE(item);
-   }
-   OBJ_DESTRUCT(&daemons);
+   /* check for failed launch - if so, force terminate */
+   if (failed_launch) {
+       if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
+           ORTE_ERROR_LOG(rc);
+       }
+
+       if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
+           ORTE_ERROR_LOG(rc);
+       }
+   }

    /* check for timing request - get stop time and process if so */
    if (mca_pls_tm_component.timing) {
        if (0 != gettimeofday(&jobstop, NULL)) {
@@ -502,11 +486,11 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
{
    int rc;

-   /* order them to kill their local procs for this job */
+   /* order all of the daemons to kill their local procs for this job */
    if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}

@@ -514,7 +498,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
/**
 * Terminate the orteds for a given job
 */
-int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
+int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
    int rc;

@@ -620,83 +604,3 @@ static int pls_tm_disconnect(void)

    return ORTE_SUCCESS;
}
-
-
-static int pls_tm_check_path(char *exe, char **env)
-{
-   static int size = 256;
-   int i;
-   char *file;
-   char *cwd;
-   char *path = NULL;
-
-   /* Do we want this check at all? */
-
-   if (!mca_pls_tm_component.want_path_check) {
-       return ORTE_SUCCESS;
-   }
-
-   /* Find the path in the supplied environment */
-
-   for (i = 0; NULL != env[i]; ++i) {
-       if (0 == strncmp("PATH=", env[i], 5)) {
-           path = strdup(env[i]);
-           break;
-       }
-   }
-   if (NULL == env[i]) {
-       path = strdup("NULL");
-   }
-
-   /* Check the already-successful paths (i.e., be a little
-      friendlier to the filesystem -- if we find the executable
-      successfully, save it) */
-
-   for (i = 0; NULL != mca_pls_tm_component.checked_paths &&
-            NULL != mca_pls_tm_component.checked_paths[i]; ++i) {
-       if (0 == strcmp(path, mca_pls_tm_component.checked_paths[i])) {
-           return ORTE_SUCCESS;
-       }
-   }
-
-   /* We didn't already find it, so check now. First, get the cwd. */
-
-   do {
-       cwd = malloc(size);
-       if (NULL == cwd) {
-           return ORTE_ERR_OUT_OF_RESOURCE;
-       }
-       if (NULL == getcwd(cwd, size)) {
-           free(cwd);
-           if (ERANGE == errno) {
-               size *= 2;
-           } else {
-               return ORTE_ERR_IN_ERRNO;
-           }
-       } else {
-           break;
-       }
-   } while (1);
-
-   /* Now do the search */
-
-   file = opal_path_findv(exe, X_OK, env, cwd);
-   free(cwd);
-   if (NULL == file) {
-       free(path);
-       return ORTE_ERR_NOT_FOUND;
-   }
-   if (mca_pls_tm_component.debug) {
-       opal_output(0, "pls:tm: found %s", file);
-   }
-   free(file);
-
-   /* Success -- so cache it */
-
-   opal_argv_append_nosize(&mca_pls_tm_component.checked_paths, path);
-
-   /* All done */
-
-   free(path);
-   return ORTE_SUCCESS;
-}
|
@ -256,7 +256,8 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
|
||||
num_apps = map->num_apps;
|
||||
|
||||
/* next, get the vpid_start and range */
|
||||
rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
|
||||
vpid_start = 0;
|
||||
rc = orte_ns.get_vpid_range(jobid, &vpid_range);
|
||||
if (rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
@ -375,7 +376,7 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs)
|
||||
int orte_pls_xcpu_terminate_orteds(struct timeval *timeout, opal_list_t * attrs)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file
|
||||
*/
|
||||
int orte_pls_xcpu_launch_job(orte_jobid_t);
|
||||
int orte_pls_xcpu_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *);
|
||||
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs);
|
||||
int orte_pls_xcpu_terminate_orteds(struct timeval *timeout, opal_list_t * attrs);
|
||||
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
|
||||
int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t*);
|
||||
int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig);
|
||||
|
@ -79,9 +79,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
|
||||
was, barf because they specifically asked for something we
|
||||
can't provide. */
|
||||
if (context->user_specified_cwd) {
|
||||
opal_show_help("help-rmgr-base.txt", "chdir-error",
|
||||
true, hostname, context->cwd, strerror(errno));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
return ORTE_ERR_WDIR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* If the user didn't specifically ask for it, then it
|
||||
@ -99,9 +97,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
|
||||
good = false;
|
||||
}
|
||||
if (!good) {
|
||||
opal_show_help("help-rmgr-base.txt", "chdir-error",
|
||||
true, tmp, strerror(errno));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
return ORTE_ERR_WDIR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* Reset the pwd in this local copy of the
|
||||
@ -154,19 +150,13 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
|
||||
free(tmp);
|
||||
tmp = opal_path_findv(context->argv[0], X_OK, environ, context->cwd);
|
||||
if (NULL == tmp) {
|
||||
opal_show_help("help-rmgr-base.txt",
|
||||
"argv0-not-found",
|
||||
true, hostname, context->argv[0]);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
return ORTE_ERR_EXE_NOT_FOUND;
|
||||
}
|
||||
free(context->app);
|
||||
context->app = tmp;
|
||||
} else {
|
||||
if (0 != access(context->app, X_OK)) {
|
||||
opal_show_help("help-rmgr-base.txt",
|
||||
"argv0-not-accessible",
|
||||
true, hostname, context->argv[0]);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
return ORTE_ERR_EXE_NOT_ACCESSIBLE;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -65,9 +65,8 @@ orte_rmgr_base_module_t orte_rmgr = {
|
||||
orte_rmgr_base_put_app_context,
|
||||
orte_rmgr_base_check_context_cwd,
|
||||
orte_rmgr_base_check_context_app,
|
||||
orte_rmgr_base_set_vpid_range,
|
||||
orte_rmgr_base_get_vpid_range
|
||||
|
||||
orte_rmgr_base_set_proc_info,
|
||||
orte_rmgr_base_get_proc_info
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -35,121 +35,6 @@
|
||||
|
||||
#include "orte/mca/rmgr/base/rmgr_private.h"
|
||||
|
||||
/**
|
||||
* Set the vpid start and range for a job/pset on the registry
|
||||
*/
|
||||
|
||||
int orte_rmgr_base_set_vpid_range(orte_jobid_t jobid, orte_vpid_t start, orte_vpid_t range)
|
||||
{
|
||||
orte_gpr_value_t *value;
|
||||
char *segment;
|
||||
int rc;
|
||||
|
||||
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_OVERWRITE, segment, 2, 1))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(segment);
|
||||
return rc;
|
||||
}
|
||||
free(segment);
|
||||
value->tokens[0] = strdup(ORTE_JOB_GLOBALS);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_JOB_VPID_START_KEY, ORTE_VPID, &start))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(value);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_JOB_VPID_RANGE_KEY, ORTE_VPID, &range))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(value);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_gpr.put(1, &value);
|
||||
if (ORTE_SUCCESS != rc) ORTE_ERROR_LOG(rc);
|
||||
|
||||
OBJ_RELEASE(value);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the vpid start and range for a job/pset from the registry
|
||||
*/
|
||||
|
||||
int orte_rmgr_base_get_vpid_range(orte_jobid_t jobid, orte_vpid_t *start, orte_vpid_t *range)
|
||||
{
|
||||
char *segment;
|
||||
char *tokens[2];
|
||||
char *keys[3];
|
||||
orte_gpr_value_t** values = NULL;
|
||||
orte_std_cntr_t i, num_values = 0;
|
||||
orte_vpid_t *vptr;
|
||||
int rc;
|
||||
|
||||
/* query the job segment on the registry */
|
||||
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
tokens[0] = ORTE_JOB_GLOBALS;
|
||||
tokens[1] = NULL;
|
||||
|
||||
keys[0] = ORTE_JOB_VPID_START_KEY;
|
||||
keys[1] = ORTE_JOB_VPID_RANGE_KEY;
|
||||
keys[2] = NULL;
|
||||
|
||||
rc = orte_gpr.get(
|
||||
ORTE_GPR_KEYS_AND|ORTE_GPR_TOKENS_OR,
|
||||
segment,
|
||||
tokens,
|
||||
keys,
|
||||
&num_values,
|
||||
&values
|
||||
);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
free(segment);
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if(num_values != 1) {
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
for(i=0; i<values[0]->cnt; i++) {
|
||||
if(strcmp(values[0]->keyvals[i]->key, ORTE_JOB_VPID_START_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, values[0]->keyvals[i]->value, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
*start = *vptr;
|
||||
continue;
|
||||
}
|
||||
if(strcmp(values[0]->keyvals[i]->key, ORTE_JOB_VPID_RANGE_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, values[0]->keyvals[i]->value, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
*range = *vptr;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
for(i=0; i<num_values; i++)
|
||||
OBJ_RELEASE(values[i]);
|
||||
free(segment);
|
||||
free(values);
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_rmgr_base_set_proc_info(const orte_process_name_t* name, pid_t pid, char *nodename)
|
||||
{
|
||||
orte_gpr_value_t *values[1];
|
||||
|
@@ -80,11 +80,6 @@ ORTE_DECLSPEC int orte_rmgr_base_check_context_app(orte_app_context_t *context);
ORTE_DECLSPEC int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
                                                   bool want_chdir);

-ORTE_DECLSPEC int orte_rmgr_base_set_vpid_range(orte_jobid_t jobid, orte_vpid_t start, orte_vpid_t range);
-
-ORTE_DECLSPEC int orte_rmgr_base_get_vpid_range(orte_jobid_t jobid, orte_vpid_t *start, orte_vpid_t *range);
-
ORTE_DECLSPEC int orte_rmgr_base_set_proc_info(const orte_process_name_t* name, pid_t pid, char * nodename);
ORTE_DECLSPEC int orte_rmgr_base_get_proc_info(const orte_process_name_t* name, pid_t* pid, char **nodename);
@@ -75,14 +75,6 @@ static int orte_rmgr_cnos_check_context_app(orte_app_context_t *context);
static int orte_rmgr_cnos_check_context_cwd(orte_app_context_t *context,
                                            bool want_chdir);

-static int orte_rmgr_cnos_set_vpid_range(orte_jobid_t jobid,
-                                         orte_vpid_t start,
-                                         orte_vpid_t range);
-
-static int orte_rmgr_cnos_get_vpid_range(orte_jobid_t jobid,
-                                         orte_vpid_t *start,
-                                         orte_vpid_t *range);
-
static orte_gpr_keyval_t* orte_rmgr_cnos_find_attribute(opal_list_t* attr_list, char* key);

static int orte_rmgr_cnos_add_attribute(opal_list_t* attr_list, char* key,
@@ -114,8 +106,6 @@ orte_rmgr_base_module_t orte_rmgr_cnos_module = {
    orte_rmgr_cnos_put_app_context,
    orte_rmgr_cnos_check_context_cwd,
    orte_rmgr_cnos_check_context_app,
-   orte_rmgr_cnos_set_vpid_range,
-   orte_rmgr_cnos_get_vpid_range,
    orte_rmgr_cnos_set_proc_info,
    orte_rmgr_cnos_get_proc_info
};
@@ -196,20 +186,6 @@ static int orte_rmgr_cnos_check_context_cwd(orte_app_context_t *context,
    return ORTE_ERR_NOT_SUPPORTED;
}

-static int orte_rmgr_cnos_set_vpid_range(orte_jobid_t jobid,
-                                         orte_vpid_t start,
-                                         orte_vpid_t range)
-{
-   return ORTE_ERR_NOT_SUPPORTED;
-}
-
-static int orte_rmgr_cnos_get_vpid_range(orte_jobid_t jobid,
-                                         orte_vpid_t *start,
-                                         orte_vpid_t *range)
-{
-   return ORTE_ERR_NOT_SUPPORTED;
-}
-
static orte_gpr_keyval_t* orte_rmgr_cnos_find_attribute(opal_list_t* attr_list, char* key)
{
    return NULL;
|
@ -74,8 +74,6 @@ orte_rmgr_base_module_t orte_rmgr_proxy_module = {
|
||||
orte_rmgr_base_put_app_context,
|
||||
orte_rmgr_base_check_context_cwd,
|
||||
orte_rmgr_base_check_context_app,
|
||||
orte_rmgr_base_set_vpid_range,
|
||||
orte_rmgr_base_get_vpid_range,
|
||||
orte_rmgr_base_set_proc_info,
|
||||
orte_rmgr_base_get_proc_info
|
||||
};
|
||||
|
@ -242,25 +242,6 @@ typedef int (*orte_rmgr_base_module_check_context_cwd_fn_t)(orte_app_context_t *
|
||||
*/
|
||||
typedef int (*orte_rmgr_base_module_check_context_app_fn_t)(orte_app_context_t *context);
|
||||
|
||||
/**
|
||||
* VPID FUNCTIONS
|
||||
*/
|
||||
|
||||
/**
|
||||
* Store the vpid range of a job
|
||||
*/
|
||||
typedef int (*orte_rmgr_base_module_set_vpid_range_fn_t)(orte_jobid_t jobid,
|
||||
orte_vpid_t start,
|
||||
orte_vpid_t range);
|
||||
|
||||
|
||||
/**
|
||||
* Retrieve the vpid range of a job
|
||||
*/
|
||||
typedef int (*orte_rmgr_base_module_get_vpid_range_fn_t)(orte_jobid_t jobid,
|
||||
orte_vpid_t *start,
|
||||
orte_vpid_t *range);
|
||||
|
||||
/**
|
||||
* Set the process' local PID
|
||||
*/
|
||||
@ -290,8 +271,6 @@ struct orte_rmgr_base_module_2_0_0_t {
|
||||
orte_rmgr_base_module_store_app_context_fn_t store_app_context;
|
||||
orte_rmgr_base_module_check_context_cwd_fn_t check_context_cwd;
|
||||
orte_rmgr_base_module_check_context_app_fn_t check_context_app;
|
||||
orte_rmgr_base_module_set_vpid_range_fn_t set_vpid_range;
|
||||
orte_rmgr_base_module_get_vpid_range_fn_t get_vpid_range;
|
||||
orte_rmgr_base_module_set_process_info_fn_t set_process_info;
|
||||
orte_rmgr_base_module_get_process_info_fn_t get_process_info;
|
||||
};
|
||||
|
@ -89,8 +89,6 @@ orte_rmgr_base_module_t orte_rmgr_urm_module = {
|
||||
orte_rmgr_base_put_app_context,
|
||||
orte_rmgr_base_check_context_cwd,
|
||||
orte_rmgr_base_check_context_app,
|
||||
orte_rmgr_base_set_vpid_range,
|
||||
orte_rmgr_base_get_vpid_range,
|
||||
orte_rmgr_base_set_proc_info,
|
||||
orte_rmgr_base_get_proc_info
|
||||
};
|
||||
|
@ -27,5 +27,5 @@ here's some additional information (which may only be relevant to an
|
||||
Open MPI developer):
|
||||
|
||||
%s failed
|
||||
--> Returned value %d instead of ORTE_SUCCESS
|
||||
--> Returned value %s (%d) instead of ORTE_SUCCESS
|
||||
|
||||
|
@@ -206,7 +206,7 @@ int orte_init_stage1(bool infrastructure)
    /*
     * Initialize the daemon launch system so those types
     * are registered (needed by the sds to talk to its
-    * local daemon)
+    * local daemon)
     */
    if (ORTE_SUCCESS != (ret = orte_odls_base_open())) {
        ORTE_ERROR_LOG(ret);
@@ -282,10 +282,9 @@ int orte_init_stage1(bool infrastructure)

    /*
     * Now that we know for certain if we are an HNP and/or a daemon,
-    * setup the resource management frameworks. This includes opening
-    * and selecting the daemon launch framework - that framework "knows"
-    * what to do if it isn't in a daemon, and everyone needs that framework
-    * to at least register its datatypes.
+    * setup the resource management frameworks. This includes
+    * selecting the daemon launch framework - that framework "knows"
+    * what to do if it isn't in a daemon.
     */
    if (ORTE_SUCCESS != (ret = orte_rds_base_open())) {
        ORTE_ERROR_LOG(ret);
@@ -420,12 +419,6 @@ int orte_init_stage1(bool infrastructure)
    }
    OBJ_RELEASE(app);

-   if (ORTE_SUCCESS != (ret = orte_rmgr.set_vpid_range(my_jobid,0,1))) {
-       ORTE_ERROR_LOG(ret);
-       error = "orte_rmgr.set_vpid_range for singleton/seed";
-       goto error;
-   }
-
    if (orte_process_info.singleton) {
        /* setup a fake node structure - this is required to support
         * the MPI attributes function that is sitting on a trigger
@@ -734,7 +727,7 @@ error:
    if (ret != ORTE_SUCCESS) {
        opal_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
-                      true, error, ret);
+                      true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
@@ -109,12 +109,12 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
        goto CLEANUP;
    }

-   if (orted_globals.debug_daemons) {
-       opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
-                   ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[0]);
-   }

+   for (n=0; n < num_jobs; n++) {
+       if (orted_globals.debug_daemons) {
+           opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
+                       ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[n]);
+       }
+
+       if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(jobs[n], true))) {
+           ORTE_ERROR_LOG(ret);
+       }
@@ -382,7 +382,7 @@ static void halt_vm(void)
    /* terminate the vm - this will also wake us up so we can exit */
    OBJ_CONSTRUCT(&attrs, opal_list_t);
    orte_rmgr.add_attribute(&attrs, ORTE_DAEMON_HARD_KILL, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
-   ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs);
+   ret = orte_pls.terminate_orteds(&orte_abort_timeout, &attrs);
    while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
    OBJ_DESTRUCT(&attrs);