
Complete the modifications for handling failed-to-start applications. Modifications for failed-to-start orteds are coming next.
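
For context, a minimal sketch of the cleanup pattern these changes add to each PLS launcher's application-launch path (assembled from the diffs below; the flag name and whether the jobid comes from "jobid" or "map->job" vary by component):

    /* sketch only - mirrors the failed-launch handling added in the diffs below */
    cleanup:
        if (failed_launch) {
            /* mark the job so everyone knows it never started */
            if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
                ORTE_ERROR_LOG(rc);
            }
            /* wake up orterun so the job can cleanly terminate */
            if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;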

This completes the minor changes required to the PLS components. Basically, a small change is required to the parameter list of the orted cmd functions. I caught it and made the change for xcpu and poe, in addition to the components listed in my email - so I think that only leaves xgrid unconverted.
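
For reference, the parameter-list change to the orted cmd functions is the one reflected in the pls.h typedef (shown in the diffs below) - terminate_orteds no longer takes a jobid:

    /* old */
    typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);

    /* new */
    typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(struct timeval *timeout, opal_list_t *attrs);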

The orted fail-to-start mods will also make changes in the PLS components, but those can be localized, so they will come in one at a time.

This commit was SVN r14499.
This commit is contained in:
Ralph Castain 2007-04-24 20:53:54 +00:00
parent a764aa6395
commit 18cb5c9762
25 changed files with 298 additions and 509 deletions

View File

@ -169,13 +169,6 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
break;
case ORTE_PLS_TERMINATE_ORTEDS_CMD:
/* get the jobid whose daemons are to be terminated */
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto SEND_ANSWER;
}
/* get any attributes */
OBJ_CONSTRUCT(&attrs, opal_list_t);
count = 1;
@ -199,7 +192,7 @@ void orte_pls_base_recv(int status, orte_process_name_t* sender,
timeout.tv_usec = microsecs;
/* issue the command */
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(job, &timeout, &attrs))) {
if (ORTE_SUCCESS != (rc = orte_pls.terminate_orteds(&timeout, &attrs))) {
ORTE_ERROR_LOG(rc);
}

View File

@ -46,7 +46,6 @@
#include "opal/mca/installdirs/installdirs.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
@ -72,6 +71,7 @@
#include "orte/mca/schema/schema_types.h"
#include "orte/mca/smr/smr.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
@ -438,6 +438,10 @@ static void orte_pls_bproc_setup_env(char *** env)
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
int * daemon_list = NULL;
int num_daemons = 0;
@ -452,9 +456,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
orte_vpid_t daemon_vpid_start;
orte_std_cntr_t idx;
struct stat buf;
opal_list_t daemons;
orte_pls_daemon_info_t *dmn;
opal_list_item_t *item;
struct timeval joblaunchstart, launchstart, launchstop;
OPAL_TRACE(1);
@ -468,11 +469,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
/* indicate that the daemons have not completely launched yet */
daemons_launched = false;
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done
*/
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* get the number of nodes in this job and allocate an array for
* their names so we can pass that to bproc - populate the list
* with the node names
@ -480,12 +476,12 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
num_daemons = map->num_nodes;
if (0 == num_daemons) {
/* nothing to do */
OBJ_DESTRUCT(&daemons);
return ORTE_SUCCESS;
}
if(NULL == (daemon_list = (int*)malloc(sizeof(int) * num_daemons))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
i = 0;
@ -500,6 +496,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
/* allocate storage for bproc to return the daemon pids */
if(NULL == (pids = (int*)malloc(sizeof(int) * num_daemons))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
@ -632,6 +629,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
rc, *pids);
}
/* we need to be smarter here - right now, we stop on the first negative pid. But
* daemons beyond that one might have started. This could leave a daemon stranded
* when we abort
*/
for(i = 0; i < num_daemons; i++) {
if(0 >= pids[i]) {
opal_show_help("help-pls-bproc.txt", "daemon-launch-bad-pid", true,
@ -650,28 +651,10 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
dmn = OBJ_NEW(orte_pls_daemon_info_t);
rc = orte_ns.create_process_name(&(dmn->name), ORTE_PROC_MY_NAME->cellid, 0,
daemon_vpid_start + i);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
dmn->cell = dmn->name->cellid;
dmn->nodename = strdup(param);
dmn->active_job = map->job;
opal_list_append(&daemons, &dmn->super);
free(param);
}
}
/* store the daemon info */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
ORTE_ERROR_LOG(rc);
}
/* setup the callbacks - this needs to be done *after* we store the
* daemon info so that short-lived apps don't cause mpirun to
* try and terminate the orteds before we record them
@ -718,7 +701,6 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
}
rc = ORTE_ERROR;
ORTE_ERROR_LOG(rc);
orte_pls_bproc_terminate_job(map->job, &orte_abort_timeout, NULL);
goto cleanup;
}
}
@ -747,10 +729,17 @@ cleanup:
if(NULL != orted_path) {
free(orted_path);
}
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
/* check for failed launch - if so, force terminate */
if (!daemons_launched) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(map->job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(map->job))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&daemons);
return rc;
}
@ -784,7 +773,7 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
orte_pls_bproc_terminate_job(job, &orte_abort_timeout, NULL);
/* kill the daemons */
orte_pls_bproc_terminate_job(0, &orte_abort_timeout, NULL);
orte_pls_bproc_terminate_orteds(&orte_abort_timeout, NULL);
/* shouldn't ever get here.. */
exit(1);
@ -806,9 +795,16 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate. Since we don't use the ORTE
* daemons to launch the application procs, this is the *only*
* way we have of knowing something went wrong.
*/
static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
orte_vpid_t vpid_start, int app_context) {
int *node_array, num_nodes, cycle;
int *node_array=NULL, num_nodes, cycle;
int rc, i, j, stride;
orte_std_cntr_t num_processes;
int *pids = NULL;
@ -817,6 +813,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
struct bproc_io_t bproc_io[3];
char **env;
int dbg;
bool apps_launched = false;
OPAL_TRACE(1);
@ -862,7 +859,8 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
node_array = (int*)malloc(map->num_nodes * sizeof(int));
if (NULL == node_array) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* initialize the cycle count. Computing the process name under Bproc
@ -949,6 +947,10 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
goto cleanup;
}
/* we need to be smarter here - right now, we stop on the first negative pid. But
* processes beyond that one might have started. This leaves those procs stranded
* when we abort
*/
for(j = 0; j < num_nodes; j++) {
if(0 >= pids[j]) {
opal_show_help("help-pls-bproc.txt", "proc-launch-bad-pid", true,
@ -1007,15 +1009,34 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
goto cleanup;
}
}
/* get here if the app procs launched cleanly */
apps_launched = true;
cleanup:
if(NULL != pids) {
free(pids);
}
free(node_array);
if (NULL != node_array) {
free(node_array);
}
if (NULL != env) {
opal_argv_free(env);
}
/* check for failed launch - if so, force terminate */
if (!apps_launched) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(map->job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(map->job))) {
ORTE_ERROR_LOG(rc);
}
}
if (NULL != env) opal_argv_free(env);
return rc;
}
@ -1032,8 +1053,13 @@ cleanup:
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
int orte_pls_bproc_launch(orte_jobid_t jobid) {
orte_job_map_t* map;
orte_job_map_t* map = NULL;
orte_mapped_node_t *map_node;
orte_vpid_t vpid_launch;
int rc;
@ -1043,26 +1069,31 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
char cwd_save[OMPI_PATH_MAX + 1];
orte_ras_node_t *ras_node;
char **daemon_env;
bool launched;
OPAL_TRACE(1);
/* indicate the launch condition */
launched = false;
/* make sure the pls_bproc receive function has been started */
if (ORTE_SUCCESS != (rc = orte_pls_bproc_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
/* save the current working directory */
if (NULL == getcwd(cwd_save, sizeof(cwd_save))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
cwd_save[sizeof(cwd_save) - 1] = '\0';
/* get the job map */
if(ORTE_SUCCESS != (rc = orte_rmaps.get_job_map(&map, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
/* set the mapping mode */
@ -1158,16 +1189,32 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
vpid_launch += map->apps[context]->num_procs;
}
/* indicate a successful launch */
launched = true;
cleanup:
chdir(cwd_save);
OBJ_RELEASE(map);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (mca_pls_bproc_component.do_not_launch) {
/* indicate that we failed to launch, but do so silently */
return ORTE_ERR_SILENT;
}
/* check for failed launch - if so, force terminate */
if (!launched) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}
@ -1203,17 +1250,15 @@ int orte_pls_bproc_terminate_job(orte_jobid_t jobid, struct timeval *timeout, op
/**
* Terminate the orteds for a given job
*/
int orte_pls_bproc_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_bproc_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
OPAL_TRACE(1);
/* now tell them to die! */
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}

View File

@ -43,7 +43,7 @@
static int orte_pls_cnos_launch_job(orte_jobid_t jobid);
static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int orte_pls_cnos_terminate_proc(const orte_process_name_t* proc_name);
static int orte_pls_cnos_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int orte_pls_cnos_signal_proc(const orte_process_name_t* proc_name, int32_t signal);
@ -91,18 +91,13 @@ static int orte_pls_cnos_terminate_job(orte_jobid_t jobid, struct timeval *timeo
}
static int orte_pls_cnos_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
static int orte_pls_cnos_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
orte_jobid_t my_jobid = ORTE_PROC_MY_NAME->jobid;
/* make sure it's my job */
if (jobid == my_jobid) {
#ifdef HAVE_KILLRANK
killrank(-1, SIGKILL);
killrank(-1, SIGKILL);
#else
exit(0);
exit(0);
#endif
}
return ORTE_ERR_NOT_SUPPORTED;
}

View File

@ -207,7 +207,7 @@ typedef int (*orte_pls_base_module_terminate_job_fn_t)(orte_jobid_t, struct time
/**
* Terminate the daemons associated with this jobid
*/
typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(orte_jobid_t, struct timeval *timeout, opal_list_t *attrs);
typedef int (*orte_pls_base_module_terminate_orteds_fn_t)(struct timeval *timeout, opal_list_t *attrs);
/**
* Terminate a specific process.

View File

@ -64,7 +64,7 @@ extern char **environ;
*/
static int pls_poe_launch_job(orte_jobid_t jobid);
static int pls_poe_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_poe_terminate_proc(const orte_process_name_t *name);
static int pls_poe_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_poe_signal_proc(const orte_process_name_t *name, int32_t signal);
@ -477,7 +477,8 @@ static inline int poe_launch_interactive_job(orte_jobid_t jobid)
fclose(hfp);
}
rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
vpid_start = 0;
rc = orte_ns.get_vpid_range(jobid, &vpid_range);
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; }
/* Create a temporary POE command file */
@ -589,7 +590,7 @@ static int pls_poe_terminate_proc(const orte_process_name_t *name)
return ORTE_ERR_NOT_IMPLEMENTED;
}
static int pls_poe_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
static int pls_poe_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}

View File

@ -201,7 +201,7 @@ int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal
return ORTE_SUCCESS;
}
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
@ -226,12 +226,6 @@ int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, o
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, attrs, 1, ORTE_ATTR_LIST))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);

View File

@ -53,7 +53,7 @@ int orte_pls_proxy_finalize(void);
*/
int orte_pls_proxy_launch(orte_jobid_t job);
int orte_pls_proxy_terminate_job(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_orteds(orte_jobid_t job, struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
int orte_pls_proxy_terminate_proc(const orte_process_name_t* name);
int orte_pls_proxy_signal_job(orte_jobid_t job, int32_t signal, opal_list_t *attrs);
int orte_pls_proxy_signal_proc(const orte_process_name_t* name, int32_t signal);

View File

@ -55,7 +55,7 @@ int orte_pls_rsh_finalize(void);
*/
int orte_pls_rsh_launch(orte_jobid_t);
int orte_pls_rsh_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_orteds(orte_jobid_t, struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t*);
int orte_pls_rsh_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_rsh_signal_job(orte_jobid_t, int32_t, opal_list_t*);
int orte_pls_rsh_signal_proc(const orte_process_name_t* proc_name, int32_t);

View File

@ -1071,7 +1071,7 @@ int orte_pls_rsh_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal
/**
* Terminate the orteds for a given job
*/
int orte_pls_rsh_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int orte_pls_rsh_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;

View File

@ -75,7 +75,7 @@
*/
static int pls_slurm_launch_job(orte_jobid_t jobid);
static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
static int pls_slurm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_slurm_signal_proc(const orte_process_name_t *name, int32_t signal);
@ -101,9 +101,10 @@ orte_pls_base_module_1_3_0_t orte_pls_slurm_module = {
};
/*
* Local variable
* Local variables
*/
static pid_t srun_pid = 0;
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
/*
@ -113,16 +114,19 @@ static pid_t srun_pid = 0;
extern char **environ;
#endif /* !defined(__WINDOWS__) */
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int pls_slurm_launch_job(orte_jobid_t jobid)
{
orte_job_map_t *map;
orte_job_map_t *map = NULL;
opal_list_item_t *item;
size_t num_nodes;
orte_vpid_t vpid;
orte_vpid_t start_vpid;
char *jobid_string = NULL;
char *param;
char **argv;
char **argv = NULL;
int argc;
int rc;
char *tmp;
@ -136,10 +140,9 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
char **custom_strings;
int num_args, i;
char *cur_prefix;
opal_list_t daemons;
orte_pls_daemon_info_t *dmn;
struct timeval joblaunchstart, launchstart, launchstop;
int proc_name_index = 0;
bool failed_launch = true;
if (mca_pls_slurm_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
@ -147,10 +150,8 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
}
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done
*/
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* save the active jobid */
active_job = jobid;
/* Query the map for this job.
* We need the entire mapping for a couple of reasons:
@ -161,8 +162,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&daemons);
return rc;
goto cleanup;
}
/* if the user requested that we re-use daemons,
@ -171,9 +171,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return rc;
goto cleanup;
}
}
@ -186,14 +184,13 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
* on existing daemons, so we can just return
*/
OBJ_RELEASE(map);
OBJ_DESTRUCT(&daemons);
return ORTE_SUCCESS;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
start_vpid = vpid;
/* setup the orted triggers for passing their launch info */
if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
@ -332,31 +329,6 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
}
/* setup the daemon info for each node */
vpid = start_vpid;
for (item = opal_list_get_first(&map->nodes);
item != opal_list_get_end(&map->nodes);
item = opal_list_get_next(item)) {
orte_mapped_node_t* node = (orte_mapped_node_t*)item;
/* record the daemons info for this node */
dmn = OBJ_NEW(orte_pls_daemon_info_t);
dmn->active_job = jobid;
dmn->cell = node->cell;
dmn->nodename = strdup(node->nodename);
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(dmn->name), dmn->cell, 0, vpid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
opal_list_append(&daemons, &dmn->super);
vpid++;
}
/* store the daemon info on the registry */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
ORTE_ERROR_LOG(rc);
}
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed", NULL, NULL);
@ -374,7 +346,19 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
/* exec the daemon */
rc = pls_slurm_start_proc(argc, argv, env, cur_prefix);
if (ORTE_SUCCESS != (rc = pls_slurm_start_proc(argc, argv, env, cur_prefix))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* do NOT wait for srun to complete. Srun only completes when the processes
* it starts - in this case, the orteds - complete. We need to go ahead and
* return so orterun can do the rest of its stuff. Instead, we'll catch
* any srun failures and deal with them elsewhere
*/
/* declare the launch a success */
failed_launch = false;
if (mca_pls_slurm_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
@ -395,21 +379,32 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
}
/* JMS: should we stash the srun pid in the gpr somewhere for cleanup? */
/* JMS: how do we catch when srun dies? */
cleanup:
OBJ_RELEASE(map);
opal_argv_free(argv);
opal_argv_free(env);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != env) {
opal_argv_free(env);
}
if(NULL != jobid_string) {
free(jobid_string);
}
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
/* check for failed launch - if so, force terminate */
if (failed_launch) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&daemons);
return rc;
}
@ -431,11 +426,18 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid, struct timeval *timeout,
/**
* Terminate the orteds for a given job
*/
static int pls_slurm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
static int pls_slurm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
/* order them to go away */
/* deregister the waitpid callback to ensure we don't make it look like
* srun failed when it didn't. Since the srun may have already completed,
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
* messages
*/
orte_wait_cb_cancel(srun_pid);
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_exit(timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
@ -495,7 +497,7 @@ static int pls_slurm_cancel_operation(void)
static int pls_slurm_finalize(void)
{
int rc;
/* cleanup any pending recvs */
if (ORTE_SUCCESS != (rc = orte_pls_base_comm_stop())) {
ORTE_ERROR_LOG(rc);
@ -505,6 +507,46 @@ static int pls_slurm_finalize(void)
}
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
/* According to the SLURM folks, srun always returns the highest exit
code of our remote processes. Thus, a non-zero exit status doesn't
necessarily mean that srun failed - it could be that an orted returned
a non-zero exit status. Of course, that means the orted failed(!), so
the end result is the same - the job didn't start.
As a result, we really can't do much with the exit status itself - it
could be something in errno (if srun itself failed), or it could be
something returned by an orted, or it could be something returned by
the OS (e.g., couldn't find the orted binary). Somebody is welcome
to sort out all the options and pretty-print a better error message. For
now, though, the only thing that really matters is that
srun failed. Report the error and make sure that orterun
wakes up - otherwise, do nothing!
*/
int rc;
if (0 != status) {
/* we have a problem */
opal_output(0, "ERROR: srun failed to start the required daemons.");
opal_output(0, "ERROR: This could be due to an inability to find the orted binary");
opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
/* set the job state so we know it failed to start */
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
/* force termination of the job */
if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
ORTE_ERROR_LOG(rc);
}
}
}
static int pls_slurm_start_proc(int argc, char **argv, char **env,
char *prefix)
{
@ -517,9 +559,11 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
srun_pid = fork();
if (-1 == srun_pid) {
opal_output(0, "pls:slurm:start_proc: fork failed");
return ORTE_ERR_IN_ERRNO;
} else if (0 == srun_pid) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
if (0 == srun_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL;
/* Figure out the basenames for the libdir and bindir. There
@ -596,14 +640,16 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
/* don't return - need to exit - returning would be bad -
we're not in the calling process anymore */
exit(1);
} else { /* parent */
/* just in case, make sure that the srun process is not in our
process group any more. Stevens says always do this on both
sides of the fork... */
setpgid(srun_pid, srun_pid);
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv);
}
free(exec_argv);
/* just in case, make sure that the srun process is not in our
process group any more. Stevens says always do this on both
sides of the fork... */
setpgid(srun_pid, srun_pid);
return ORTE_SUCCESS;
}

View File

@ -25,20 +25,17 @@ The first two prefix values supplied for node %s were:
%s
and %s
#
[daemon-not-found]
The TM (PBS / Torque) process starter in Open MPI was unable to find
its daemon executable (orted) on the node where mpirun was executed.
[tm-spawn-failed]
The TM (PBS / Torque) process starter failed to spawn a daemon (orted)
on a remote node.
This sanity check is performed because the back-end PBS / Torque
process launcher does not provide any kind of error to Open MPI if it
tries to launch its daemon on a remote node, but the daemon cannot be
found. Open MPI's check for the daemon locally is somewhat of a lame
workaround / sanity check.
Command line: %s
Node name: %s
Launch id: %d
If you do not understand this error message, please try the following:
1. Try to add the Open MPI executables to your PATH
2. Use the --prefix option to mpirun to indicate where Open MPI can
find its executables
3. Set the MCA parameter "pls_tm_want_path_check" to 0
4. Talk to your local system administration
1. Ensure that the executable "orted" is in your PATH
2. Use the --prefix option to indicate where we can
find that executable
3. Talk to your local system administrator

View File

@ -60,6 +60,7 @@
#include "orte/orte_types.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/smr/smr.h"
@ -80,7 +81,7 @@
*/
static int pls_tm_launch_job(orte_jobid_t jobid);
static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs);
static int pls_tm_terminate_proc(const orte_process_name_t *name);
static int pls_tm_signal_job(orte_jobid_t jobid, int32_t signal, opal_list_t *attrs);
static int pls_tm_signal_proc(const orte_process_name_t *name, int32_t signal);
@ -89,7 +90,6 @@ static int pls_tm_finalize(void);
static int pls_tm_connect(void);
static int pls_tm_disconnect(void);
static int pls_tm_check_path(char *exe, char **env);
/*
* Local variables
@ -114,19 +114,23 @@ orte_pls_base_module_t orte_pls_tm_module = {
extern char **environ;
#endif /* !defined(__WINDOWS__) */
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int pls_tm_launch_job(orte_jobid_t jobid)
{
orte_job_map_t *map;
orte_job_map_t *map = NULL;
opal_list_item_t *item;
size_t num_nodes;
orte_vpid_t vpid;
int node_name_index;
int proc_name_index;
char *jobid_string;
char *param;
char **env;
char *uri, *param;
char **env = NULL;
char *var;
char **argv;
char **argv = NULL;
int argc;
int rc;
bool connected = false;
@ -136,12 +140,11 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
tm_task_id *tm_task_ids = NULL;
int local_err;
tm_event_t event;
opal_list_t daemons;
orte_pls_daemon_info_t *dmn;
struct timeval launchstart, launchstop, completionstart, completionstop;
struct timeval jobstart, jobstop;
int maxtime=0, mintime=99999999, maxiter = 0, miniter = 0, deltat;
float avgtime=0.0;
bool failed_launch = true;
/* check for timing request - get start time if so */
if (mca_pls_tm_component.timing) {
@ -158,7 +161,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
/* if the user requested that we re-use daemons,
@ -167,8 +170,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
return rc;
goto cleanup;
}
}
@ -184,6 +186,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
*/
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -193,20 +196,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done
*/
OBJ_CONSTRUCT(&daemons, opal_list_t);
/* Allocate a bunch of TM events to use for tm_spawn()ing */
tm_events = malloc(sizeof(tm_event_t) * num_nodes);
if (NULL == tm_events) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
tm_task_ids = malloc(sizeof(tm_task_id) * num_nodes);
if (NULL == tm_task_ids) {
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
@ -294,17 +294,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
}
}
/* Do a quick sanity check to ensure that we can find the
orted in the PATH */
if (ORTE_SUCCESS !=
(rc = pls_tm_check_path(argv[0], env))) {
ORTE_ERROR_LOG(rc);
opal_show_help("help-pls-tm.txt", "daemon-not-found",
true, argv[0]);
goto cleanup;
}
/* Iterate through each of the nodes and spin
* up a daemon.
*/
@ -315,19 +304,10 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
orte_process_name_t* name;
char* name_string;
/* new daemon - setup to record its info */
dmn = OBJ_NEW(orte_pls_daemon_info_t);
dmn->active_job = jobid;
opal_list_append(&daemons, &dmn->super);
/* setup node name */
free(argv[node_name_index]);
argv[node_name_index] = strdup(node->nodename);
/* record the node name in the daemon struct */
dmn->cell = node->cell;
dmn->nodename = strdup(node->nodename);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, node->cell, 0, vpid);
if (ORTE_SUCCESS != rc) {
@ -335,12 +315,6 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
goto cleanup;
}
/* save it in the daemon struct */
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup per-node options */
if (mca_pls_tm_component.debug ||
mca_pls_tm_component.verbose) {
@ -352,7 +326,7 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: unable to create process name");
return rc;
goto cleanup;
}
free(argv[proc_name_index]);
argv[proc_name_index] = strdup(name_string);
@ -377,13 +351,12 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
if (TM_SUCCESS != rc) {
return ORTE_ERROR;
}
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: start_procs returned error %d", rc);
opal_show_help("help-pls-tm.txt", "tm-spawn-failed",
true, argv[0], node->nodename, node->launch_id);
rc = ORTE_ERROR;
goto cleanup;
}
/* check for timing request - get stop time and process if so */
if (mca_pls_tm_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
@ -423,21 +396,19 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
}
}
/* all done, so store the daemon info on the registry */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&daemons))) {
ORTE_ERROR_LOG(rc);
}
/* TM poll for all the spawns */
for (i = 0; i < launched; ++i) {
rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != rc) {
errno = local_err;
opal_output(0, "pls:tm: failed to poll for a spawned proc, return status = %d", rc);
return ORTE_ERR_IN_ERRNO;
goto cleanup;
}
}
/* if we get here, then everything launched okay - record that fact */
failed_launch = false;
/* check for timing request - get stop time for launch completion and report */
if (mca_pls_tm_component.timing) {
if (0 != gettimeofday(&completionstop, NULL)) {
@ -455,7 +426,15 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
cleanup:
OBJ_RELEASE(map);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != env) {
opal_argv_free(env);
}
if (connected) {
pls_tm_disconnect();
@ -474,12 +453,17 @@ static int pls_tm_launch_job(orte_jobid_t jobid)
free(bin_base);
}
/* deconstruct the daemon list */
while (NULL != (item = opal_list_remove_first(&daemons))) {
OBJ_RELEASE(item);
/* check for failed launch - if so, force terminate */
if (failed_launch) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&daemons);
/* check for timing request - get stop time and process if so */
if (mca_pls_tm_component.timing) {
if (0 != gettimeofday(&jobstop, NULL)) {
@ -502,11 +486,11 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
{
int rc;
/* order them to kill their local procs for this job */
/* order all of the daemons to kill their local procs for this job */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_kill_local_procs(jobid, timeout, attrs))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -514,7 +498,7 @@ static int pls_tm_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
/**
* Terminate the orteds for a given job
*/
int pls_tm_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t *attrs)
int pls_tm_terminate_orteds(struct timeval *timeout, opal_list_t *attrs)
{
int rc;
@ -620,83 +604,3 @@ static int pls_tm_disconnect(void)
return ORTE_SUCCESS;
}
static int pls_tm_check_path(char *exe, char **env)
{
static int size = 256;
int i;
char *file;
char *cwd;
char *path = NULL;
/* Do we want this check at all? */
if (!mca_pls_tm_component.want_path_check) {
return ORTE_SUCCESS;
}
/* Find the path in the supplied environment */
for (i = 0; NULL != env[i]; ++i) {
if (0 == strncmp("PATH=", env[i], 5)) {
path = strdup(env[i]);
break;
}
}
if (NULL == env[i]) {
path = strdup("NULL");
}
/* Check the already-successful paths (i.e., be a little
friendlier to the filesystem -- if we find the executable
successfully, save it) */
for (i = 0; NULL != mca_pls_tm_component.checked_paths &&
NULL != mca_pls_tm_component.checked_paths[i]; ++i) {
if (0 == strcmp(path, mca_pls_tm_component.checked_paths[i])) {
return ORTE_SUCCESS;
}
}
/* We didn't already find it, so check now. First, get the cwd. */
do {
cwd = malloc(size);
if (NULL == cwd) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == getcwd(cwd, size)) {
free(cwd);
if (ERANGE == errno) {
size *= 2;
} else {
return ORTE_ERR_IN_ERRNO;
}
} else {
break;
}
} while (1);
/* Now do the search */
file = opal_path_findv(exe, X_OK, env, cwd);
free(cwd);
if (NULL == file) {
free(path);
return ORTE_ERR_NOT_FOUND;
}
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: found %s", file);
}
free(file);
/* Success -- so cache it */
opal_argv_append_nosize(&mca_pls_tm_component.checked_paths, path);
/* All done */
free(path);
return ORTE_SUCCESS;
}

View File

@ -256,7 +256,8 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
num_apps = map->num_apps;
/* next, get the vpid_start and range */
rc = orte_rmgr.get_vpid_range(jobid, &vpid_start, &vpid_range);
vpid_start = 0;
rc = orte_ns.get_vpid_range(jobid, &vpid_range);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
@ -375,7 +376,7 @@ int orte_pls_xcpu_terminate_job(orte_jobid_t jobid, struct timeval *timeout, opa
return ORTE_SUCCESS;
}
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs)
int orte_pls_xcpu_terminate_orteds(struct timeval *timeout, opal_list_t * attrs)
{
return ORTE_SUCCESS;
}

View File

@ -63,7 +63,7 @@ orte_pls_base_module_t* orte_pls_xcpu_init(int *priority); /* in component file
*/
int orte_pls_xcpu_launch_job(orte_jobid_t);
int orte_pls_xcpu_terminate_job(orte_jobid_t, struct timeval *timeout, opal_list_t *);
int orte_pls_xcpu_terminate_orteds(orte_jobid_t jobid, struct timeval *timeout, opal_list_t * attrs);
int orte_pls_xcpu_terminate_orteds(struct timeval *timeout, opal_list_t * attrs);
int orte_pls_xcpu_terminate_proc(const orte_process_name_t* proc_name);
int orte_pls_xcpu_signal_job(orte_jobid_t jobid, int32_t sig, opal_list_t*);
int orte_pls_xcpu_signal_proc(const orte_process_name_t* proc_name, int32_t sig);

View File

@ -79,9 +79,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
was, barf because they specifically asked for something we
can't provide. */
if (context->user_specified_cwd) {
opal_show_help("help-rmgr-base.txt", "chdir-error",
true, hostname, context->cwd, strerror(errno));
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_WDIR_NOT_FOUND;
}
/* If the user didn't specifically ask for it, then it
@ -99,9 +97,7 @@ int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
good = false;
}
if (!good) {
opal_show_help("help-rmgr-base.txt", "chdir-error",
true, tmp, strerror(errno));
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_WDIR_NOT_FOUND;
}
/* Reset the pwd in this local copy of the
@ -154,19 +150,13 @@ int orte_rmgr_base_check_context_app(orte_app_context_t *context)
free(tmp);
tmp = opal_path_findv(context->argv[0], X_OK, environ, context->cwd);
if (NULL == tmp) {
opal_show_help("help-rmgr-base.txt",
"argv0-not-found",
true, hostname, context->argv[0]);
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_EXE_NOT_FOUND;
}
free(context->app);
context->app = tmp;
} else {
if (0 != access(context->app, X_OK)) {
opal_show_help("help-rmgr-base.txt",
"argv0-not-accessible",
true, hostname, context->argv[0]);
return ORTE_ERR_NOT_FOUND;
return ORTE_ERR_EXE_NOT_ACCESSIBLE;
}
}

View File

@ -65,9 +65,8 @@ orte_rmgr_base_module_t orte_rmgr = {
orte_rmgr_base_put_app_context,
orte_rmgr_base_check_context_cwd,
orte_rmgr_base_check_context_app,
orte_rmgr_base_set_vpid_range,
orte_rmgr_base_get_vpid_range
orte_rmgr_base_set_proc_info,
orte_rmgr_base_get_proc_info
};
/*

View File

@ -35,121 +35,6 @@
#include "orte/mca/rmgr/base/rmgr_private.h"
/**
* Set the vpid start and range for a job/pset on the registry
*/
int orte_rmgr_base_set_vpid_range(orte_jobid_t jobid, orte_vpid_t start, orte_vpid_t range)
{
orte_gpr_value_t *value;
char *segment;
int rc;
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, ORTE_GPR_OVERWRITE, segment, 2, 1))) {
ORTE_ERROR_LOG(rc);
free(segment);
return rc;
}
free(segment);
value->tokens[0] = strdup(ORTE_JOB_GLOBALS);
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_JOB_VPID_START_KEY, ORTE_VPID, &start))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_JOB_VPID_RANGE_KEY, ORTE_VPID, &range))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
rc = orte_gpr.put(1, &value);
if (ORTE_SUCCESS != rc) ORTE_ERROR_LOG(rc);
OBJ_RELEASE(value);
return rc;
}
/**
* Get the vpid start and range for a job/pset from the registry
*/
int orte_rmgr_base_get_vpid_range(orte_jobid_t jobid, orte_vpid_t *start, orte_vpid_t *range)
{
char *segment;
char *tokens[2];
char *keys[3];
orte_gpr_value_t** values = NULL;
orte_std_cntr_t i, num_values = 0;
orte_vpid_t *vptr;
int rc;
/* query the job segment on the registry */
if(ORTE_SUCCESS != (rc = orte_schema.get_job_segment_name(&segment, jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tokens[0] = ORTE_JOB_GLOBALS;
tokens[1] = NULL;
keys[0] = ORTE_JOB_VPID_START_KEY;
keys[1] = ORTE_JOB_VPID_RANGE_KEY;
keys[2] = NULL;
rc = orte_gpr.get(
ORTE_GPR_KEYS_AND|ORTE_GPR_TOKENS_OR,
segment,
tokens,
keys,
&num_values,
&values
);
if(rc != ORTE_SUCCESS) {
free(segment);
ORTE_ERROR_LOG(rc);
return rc;
}
if(num_values != 1) {
rc = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(rc);
goto cleanup;
}
for(i=0; i<values[0]->cnt; i++) {
if(strcmp(values[0]->keyvals[i]->key, ORTE_JOB_VPID_START_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, values[0]->keyvals[i]->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
*start = *vptr;
continue;
}
if(strcmp(values[0]->keyvals[i]->key, ORTE_JOB_VPID_RANGE_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&vptr, values[0]->keyvals[i]->value, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
*range = *vptr;
continue;
}
}
cleanup:
for(i=0; i<num_values; i++)
OBJ_RELEASE(values[i]);
free(segment);
free(values);
return rc;
}
int orte_rmgr_base_set_proc_info(const orte_process_name_t* name, pid_t pid, char *nodename)
{
orte_gpr_value_t *values[1];

View File

@ -80,11 +80,6 @@ ORTE_DECLSPEC int orte_rmgr_base_check_context_app(orte_app_context_t *context);
ORTE_DECLSPEC int orte_rmgr_base_check_context_cwd(orte_app_context_t *context,
bool want_chdir);
ORTE_DECLSPEC int orte_rmgr_base_set_vpid_range(orte_jobid_t jobid, orte_vpid_t start, orte_vpid_t range);
ORTE_DECLSPEC int orte_rmgr_base_get_vpid_range(orte_jobid_t jobid, orte_vpid_t *start, orte_vpid_t *range);
ORTE_DECLSPEC int orte_rmgr_base_set_proc_info(const orte_process_name_t* name, pid_t pid, char * nodename);
ORTE_DECLSPEC int orte_rmgr_base_get_proc_info(const orte_process_name_t* name, pid_t* pid, char **nodename);

View File

@ -75,14 +75,6 @@ static int orte_rmgr_cnos_check_context_app(orte_app_context_t *context);
static int orte_rmgr_cnos_check_context_cwd(orte_app_context_t *context,
bool want_chdir);
static int orte_rmgr_cnos_set_vpid_range(orte_jobid_t jobid,
orte_vpid_t start,
orte_vpid_t range);
static int orte_rmgr_cnos_get_vpid_range(orte_jobid_t jobid,
orte_vpid_t *start,
orte_vpid_t *range);
static orte_gpr_keyval_t* orte_rmgr_cnos_find_attribute(opal_list_t* attr_list, char* key);
static int orte_rmgr_cnos_add_attribute(opal_list_t* attr_list, char* key,
@ -114,8 +106,6 @@ orte_rmgr_base_module_t orte_rmgr_cnos_module = {
orte_rmgr_cnos_put_app_context,
orte_rmgr_cnos_check_context_cwd,
orte_rmgr_cnos_check_context_app,
orte_rmgr_cnos_set_vpid_range,
orte_rmgr_cnos_get_vpid_range,
orte_rmgr_cnos_set_proc_info,
orte_rmgr_cnos_get_proc_info
};
@ -196,20 +186,6 @@ static int orte_rmgr_cnos_check_context_cwd(orte_app_context_t *context,
return ORTE_ERR_NOT_SUPPORTED;
}
static int orte_rmgr_cnos_set_vpid_range(orte_jobid_t jobid,
orte_vpid_t start,
orte_vpid_t range)
{
return ORTE_ERR_NOT_SUPPORTED;
}
static int orte_rmgr_cnos_get_vpid_range(orte_jobid_t jobid,
orte_vpid_t *start,
orte_vpid_t *range)
{
return ORTE_ERR_NOT_SUPPORTED;
}
static orte_gpr_keyval_t* orte_rmgr_cnos_find_attribute(opal_list_t* attr_list, char* key)
{
return NULL;

View File

@ -74,8 +74,6 @@ orte_rmgr_base_module_t orte_rmgr_proxy_module = {
orte_rmgr_base_put_app_context,
orte_rmgr_base_check_context_cwd,
orte_rmgr_base_check_context_app,
orte_rmgr_base_set_vpid_range,
orte_rmgr_base_get_vpid_range,
orte_rmgr_base_set_proc_info,
orte_rmgr_base_get_proc_info
};

View File

@ -242,25 +242,6 @@ typedef int (*orte_rmgr_base_module_check_context_cwd_fn_t)(orte_app_context_t *
*/
typedef int (*orte_rmgr_base_module_check_context_app_fn_t)(orte_app_context_t *context);
/**
* VPID FUNCTIONS
*/
/**
* Store the vpid range of a job
*/
typedef int (*orte_rmgr_base_module_set_vpid_range_fn_t)(orte_jobid_t jobid,
orte_vpid_t start,
orte_vpid_t range);
/**
* Retrieve the vpid range of a job
*/
typedef int (*orte_rmgr_base_module_get_vpid_range_fn_t)(orte_jobid_t jobid,
orte_vpid_t *start,
orte_vpid_t *range);
/**
* Set the process' local PID
*/
@ -290,8 +271,6 @@ struct orte_rmgr_base_module_2_0_0_t {
orte_rmgr_base_module_store_app_context_fn_t store_app_context;
orte_rmgr_base_module_check_context_cwd_fn_t check_context_cwd;
orte_rmgr_base_module_check_context_app_fn_t check_context_app;
orte_rmgr_base_module_set_vpid_range_fn_t set_vpid_range;
orte_rmgr_base_module_get_vpid_range_fn_t get_vpid_range;
orte_rmgr_base_module_set_process_info_fn_t set_process_info;
orte_rmgr_base_module_get_process_info_fn_t get_process_info;
};

View File

@ -89,8 +89,6 @@ orte_rmgr_base_module_t orte_rmgr_urm_module = {
orte_rmgr_base_put_app_context,
orte_rmgr_base_check_context_cwd,
orte_rmgr_base_check_context_app,
orte_rmgr_base_set_vpid_range,
orte_rmgr_base_get_vpid_range,
orte_rmgr_base_set_proc_info,
orte_rmgr_base_get_proc_info
};

View File

@ -27,5 +27,5 @@ here's some additional information (which may only be relevant to an
Open MPI developer):
%s failed
--> Returned value %d instead of ORTE_SUCCESS
--> Returned value %s (%d) instead of ORTE_SUCCESS

View File

@ -206,7 +206,7 @@ int orte_init_stage1(bool infrastructure)
/*
* Initialize the daemon launch system so those types
* are registered (needed by the sds to talk to its
* local daemon)
* local daemon)
*/
if (ORTE_SUCCESS != (ret = orte_odls_base_open())) {
ORTE_ERROR_LOG(ret);
@ -282,10 +282,9 @@ int orte_init_stage1(bool infrastructure)
/*
* Now that we know for certain if we are an HNP and/or a daemon,
* setup the resource management frameworks. This includes opening
* and selecting the daemon launch framework - that framework "knows"
* what to do if it isn't in a daemon, and everyone needs that framework
* to at least register its datatypes.
* setup the resource management frameworks. This includes
* selecting the daemon launch framework - that framework "knows"
* what to do if it isn't in a daemon.
*/
if (ORTE_SUCCESS != (ret = orte_rds_base_open())) {
ORTE_ERROR_LOG(ret);
@ -420,12 +419,6 @@ int orte_init_stage1(bool infrastructure)
}
OBJ_RELEASE(app);
if (ORTE_SUCCESS != (ret = orte_rmgr.set_vpid_range(my_jobid,0,1))) {
ORTE_ERROR_LOG(ret);
error = "orte_rmgr.set_vpid_range for singleton/seed";
goto error;
}
if (orte_process_info.singleton) {
/* setup a fake node structure - this is required to support
* the MPI attributes function that is sitting on a trigger
@ -734,7 +727,7 @@ error:
if (ret != ORTE_SUCCESS) {
opal_show_help("help-orte-runtime",
"orte_init:startup:internal-failure",
true, error, ret);
true, error, ORTE_ERROR_NAME(ret), ret);
}
return ret;

View File

@ -109,12 +109,12 @@ void orte_daemon_recv_pls(int status, orte_process_name_t* sender,
goto CLEANUP;
}
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[0]);
}
for (n=0; n < num_jobs; n++) {
if (orted_globals.debug_daemons) {
opal_output(0, "[%lu,%lu,%lu] orted_recv_pls: received kill_local_procs for job %ld",
ORTE_NAME_ARGS(orte_process_info.my_name), (long)jobs[n]);
}
if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(jobs[n], true))) {
ORTE_ERROR_LOG(ret);
}
@ -382,7 +382,7 @@ static void halt_vm(void)
/* terminate the vm - this will also wake us up so we can exit */
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_DAEMON_HARD_KILL, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs);
ret = orte_pls.terminate_orteds(&orte_abort_timeout, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attrs);