1
1

Modify the accounting system to recycle jobids. Properly recover resources from nodes and jobs upon completion. Adjustments in several places were required to deal with sparsely populated job, node, and proc arrays as a result of this change.

Correct an error with regard to how jobids were being computed. We needed to ensure that the job family field was not overrun as we increment jobids for comm_spawn.

Update the slurm plm module so it uses the new slurm termination procedure (brings trunk back into alignment with 1.3 branch).

Update the slurmd ess component so it doesn't get selected if we are running a singleton inside of a slurm allocation.

Cleanup HNP init by moving some code that had been in orte_globals.c for historical reasons into the ess hnp module, and removing the call to that code from the ess_base_std_prolog


NOTE: this change allows orte to support an infinite aggregate number of comm_spawn's, with up to 64k being alive at any one instant. HOWEVER, the MPI layer currently does -not- support re-use of jobids. I did some prototype coding to revise the ompi_proc_t structures, but the BTLs are caching their own data, and there was no readily apparent way to update it. Thus, attempts to spawn more than the 64k limit will abort to avoid causing the MPI layer to hang.

This commit was SVN r20700.
Этот коммит содержится в:
Ralph Castain 2009-03-03 16:39:13 +00:00
родитель fb1ecb7a45
Коммит f11931306a
14 изменённых файлов: 307 добавлений и 145 удалений

Просмотреть файл

@ -77,11 +77,11 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
*/ */
jobs = (orte_job_t**)orte_job_data->addr; jobs = (orte_job_t**)orte_job_data->addr;
for (i=1; i < orte_job_data->size; i++) { for (i=1; i < orte_job_data->size; i++) {
/* the array is left justfied, so we can quit once /* the array may have holes in it as we are recovering
* we see a NULL * jobids as they complete, so check everything
*/ */
if (NULL == jobs[i]) { if (NULL == jobs[i]) {
break; continue;
} }
if (ORTE_JOB_STATE_ABORTED != jobs[i]->state && if (ORTE_JOB_STATE_ABORTED != jobs[i]->state &&
ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state && ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state &&

Просмотреть файл

@ -46,17 +46,6 @@ int orte_ess_base_std_prolog(void)
goto error; goto error;
} }
/* if I'm the HNP, make sure that the daemon flag is NOT set so that
* components unique to non-HNP orteds can be selected and init
* my basic storage elements
*/
if (orte_process_info.hnp) {
if (ORTE_SUCCESS != (ret = orte_hnp_globals_init())) {
error = "orte_hnp_globals_init";
goto error;
}
}
/* /*
* Internal startup * Internal startup
*/ */

Просмотреть файл

@ -327,11 +327,32 @@ static int rte_init(char flags)
} }
free(contact_path); free(contact_path);
/* setup the global job and node arrays */
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
ORTE_ERROR_LOG(ret);
error = "setup job array";
goto error;
}
orte_node_pool = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node array";
goto error;
}
/* Setup the job data object for the daemons */ /* Setup the job data object for the daemons */
/* create and store the job data object */ /* create and store the job data object */
jdata = OBJ_NEW(orte_job_t); jdata = OBJ_NEW(orte_job_t);
jdata->jobid = ORTE_PROC_MY_NAME->jobid; jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_add(orte_job_data, jdata); opal_pointer_array_set_item(orte_job_data, 0, jdata);
/* create and store a node object where we are */ /* create and store a node object where we are */
node = OBJ_NEW(orte_node_t); node = OBJ_NEW(orte_node_t);
@ -465,6 +486,7 @@ static int rte_finalize(void)
{ {
char *contact_path; char *contact_path;
opal_list_item_t *item; opal_list_item_t *item;
int i;
/* remove my contact info file */ /* remove my contact info file */
contact_path = opal_os_path(false, orte_process_info.top_session_dir, contact_path = opal_os_path(false, orte_process_info.top_session_dir,
@ -517,6 +539,24 @@ static int rte_finalize(void)
} }
OBJ_DESTRUCT(&orte_local_jobdata); OBJ_DESTRUCT(&orte_local_jobdata);
/* cleanup the job and node info arrays */
if (NULL != orte_node_pool) {
for (i=0; i < orte_node_pool->size; i++) {
if (NULL != orte_node_pool->addr[i]) {
OBJ_RELEASE(orte_node_pool->addr[i]);
}
}
OBJ_RELEASE(orte_node_pool);
}
if (NULL != orte_job_data) {
for (i=0; i < orte_job_data->size; i++) {
if (NULL != orte_job_data->addr[i]) {
OBJ_RELEASE(orte_job_data->addr[i]);
}
}
OBJ_RELEASE(orte_job_data);
}
/* finalize the session directory tree */ /* finalize the session directory tree */
orte_session_dir_finalize(ORTE_PROC_MY_NAME); orte_session_dir_finalize(ORTE_PROC_MY_NAME);

Просмотреть файл

@ -76,6 +76,7 @@ int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority)
if (orte_process_info.mpi_proc && if (orte_process_info.mpi_proc &&
NULL != getenv("SLURM_JOBID") && NULL != getenv("SLURM_JOBID") &&
NULL != getenv("SLURM_STEPID") &&
NULL == orte_process_info.my_hnp_uri) { NULL == orte_process_info.my_hnp_uri) {
*priority = 30; *priority = 30;
*module = (mca_base_module_t *)&orte_ess_slurmd_module; *module = (mca_base_module_t *)&orte_ess_slurmd_module;

Просмотреть файл

@ -88,6 +88,9 @@ orte_ess_base_module_t orte_ess_slurmd_module = {
NULL /* ft_event */ NULL /* ft_event */
}; };
/* Local globals */
static bool app_init_complete;
/**** MODULE FUNCTIONS ****/ /**** MODULE FUNCTIONS ****/
static int rte_init(char flags) static int rte_init(char flags)
@ -110,6 +113,9 @@ static int rte_init(char flags)
int *ppn; int *ppn;
bool block=false, cyclic=false; bool block=false, cyclic=false;
/* init flag */
app_init_complete = false;
/* run the prolog */ /* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog"; error = "orte_ess_base_std_prolog";
@ -331,6 +337,9 @@ static int rte_init(char flags)
goto error; goto error;
} }
/* flag that we completed init */
app_init_complete = true;
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
@ -345,12 +354,17 @@ static int rte_finalize(void)
{ {
int ret; int ret;
/* use the default procedure to finish */ if (app_init_complete) {
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) { /* use the default procedure to finish */
ORTE_ERROR_LOG(ret); if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
ORTE_ERROR_LOG(ret);
}
} }
/* deconstruct my nidmap and jobmap arrays */ /* deconstruct my nidmap and jobmap arrays - this
* function protects itself from being called
* before things were initialized
*/
orte_util_nidmap_finalize(); orte_util_nidmap_finalize();
return ret; return ret;

Просмотреть файл

@ -63,7 +63,7 @@ int orte_plm_base_set_hnp_name(void)
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0; ORTE_PROC_MY_NAME->vpid = 0;
orte_plm_globals.next_jobid = ORTE_PROC_MY_NAME->jobid + 1; orte_plm_globals.next_jobid = 1;
/* copy it to the HNP field */ /* copy it to the HNP field */
ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
@ -78,12 +78,38 @@ int orte_plm_base_set_hnp_name(void)
*/ */
int orte_plm_base_create_jobid(orte_jobid_t *jobid) int orte_plm_base_create_jobid(orte_jobid_t *jobid)
{ {
if (ORTE_JOBID_MAX-1 < orte_plm_globals.next_jobid) { #if 0
orte_job_t **jobs;
int32_t j;
/* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S,
* THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO
* UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S
*/
/* see if there is a prior
* jobid that has completed and can be re-used. It can
* never be 0 as that belongs to the HNP and its daemons
*/
jobs = (orte_job_t**)orte_job_data->addr;
for (j=1; j < orte_job_data->size; j++) {
if (NULL == jobs[j]) {
/* this local jobid is available - reuse it */
*jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j);
return ORTE_SUCCESS;
}
}
#endif
if (UINT16_MAX == orte_plm_globals.next_jobid) {
/* if we get here, then no local jobids are available */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*jobid = ORTE_JOBID_INVALID; *jobid = ORTE_JOBID_INVALID;
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
*jobid = orte_plm_globals.next_jobid++; /* take the next jobid */
*jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, orte_plm_globals.next_jobid);
orte_plm_globals.next_jobid++;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -66,6 +66,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
{ {
orte_job_t *jdatorted; orte_job_t *jdatorted;
int rc; int rc;
int32_t ljob;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_job for job %s", "%s plm:base:setup_job for job %s",
@ -73,7 +74,8 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid))); ORTE_JOBID_PRINT(jdata->jobid)));
/* insert the job object into the global pool */ /* insert the job object into the global pool */
opal_pointer_array_add(orte_job_data, jdata); ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) { if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -1135,9 +1137,13 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
void orte_plm_base_check_job_completed(orte_job_t *jdata) void orte_plm_base_check_job_completed(orte_job_t *jdata)
{ {
orte_proc_t **procs; orte_proc_t **procs;
orte_vpid_t i; int i;
orte_std_cntr_t j; orte_std_cntr_t j;
orte_job_t **jobs; orte_job_t **jobs;
orte_node_t **nodes;
orte_job_map_t *map;
orte_std_cntr_t index;
bool one_still_alive;
/* if the incoming job data pointer is NULL, then all we can do /* if the incoming job data pointer is NULL, then all we can do
* is check all jobs for complete * is check all jobs for complete
@ -1167,7 +1173,13 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
* as abnormally terminated, then do not update its state * as abnormally terminated, then do not update its state
*/ */
if (jdata->state < ORTE_JOB_STATE_TERMINATED) { if (jdata->state < ORTE_JOB_STATE_TERMINATED) {
for (i=0; i < jdata->num_procs; i++) { for (i=0; i < jdata->procs->size; i++) {
/* the proc array may no longer be left justified, so
* we need to check everything
*/
if (NULL == procs[i]) {
continue;
}
if (ORTE_PROC_STATE_FAILED_TO_START == procs[i]->state) { if (ORTE_PROC_STATE_FAILED_TO_START == procs[i]->state) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START; jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
if (!jdata->abort) { if (!jdata->abort) {
@ -1217,25 +1229,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
} }
} }
/* Release the resources used by this job. */
if( NULL != jdata->map ) {
int i, index;
orte_node_t* daemon;
orte_proc_t* proc;
orte_job_map_t* map;
map = jdata->map;
for( index = 0; index < map->num_nodes; index++ ) {
daemon = (orte_node_t *) opal_pointer_array_get_item( map->nodes, index );
for( i = 0; i < (int)daemon->num_procs; i++ ) {
proc = (orte_proc_t *) opal_pointer_array_get_item(daemon->procs, i);
if( (NULL != proc) && (proc->name.jobid == jdata->jobid) ) {
daemon->slots_inuse--;
}
}
}
}
/* check the resulting job state and notify the appropriate places */ /* check the resulting job state and notify the appropriate places */
if (ORTE_JOB_STATE_FAILED_TO_START == jdata->state) { if (ORTE_JOB_STATE_FAILED_TO_START == jdata->state) {
@ -1299,14 +1292,65 @@ CHECK_ALL_JOBS:
return; return;
} }
} }
/* Release the resources used by this job. Since some errmgr's may want
* to continue using resources allocated to the job as part of their
* fault recovery procedure, we only do this once the job is "complete".
* Note that an aborted/killed job -is- flagged as complete and will
* therefore have its resources released.
*/
if( NULL != jdata->map ) {
map = jdata->map;
nodes = (orte_node_t**)map->nodes->addr;
for( index = 0; index < map->nodes->size; index++ ) {
if (NULL == nodes[index]) {
/* the nodes in a map are left-justfied and
* there are no holes in the array
*/
break;
}
procs = (orte_proc_t**)nodes[index]->procs->addr;
for( i = 0; i < nodes[index]->procs->size; i++ ) {
if (NULL == procs[i]) {
/* there can be holes in the proc array since
* we are cleaning up as we go
*/
continue;
}
if(procs[i]->name.jobid == jdata->jobid) {
nodes[index]->slots_inuse--;
nodes[index]->num_procs--;
/* release this object, ensuring that the
* pointer array internal accounting
* is maintained!
*/
OBJ_RELEASE(procs[i]);
opal_pointer_array_set_item(nodes[index]->procs, i, NULL);
}
}
}
}
/* now check to see if all jobs are done - release this jdata
* object when we find it
*/
jobs = (orte_job_t**)orte_job_data->addr; jobs = (orte_job_t**)orte_job_data->addr;
for (j=0; j < orte_job_data->size; j++) { one_still_alive = false;
for (j=1; j < orte_job_data->size; j++) {
if (NULL == jobs[j]) { if (NULL == jobs[j]) {
/* the jobs are left-justified in the array, so /* since we are releasing jdata objects as we
* if we find a NULL, that means we are past all * go, we can no longer assume that the job_data
* the jobs so we can just quit the loop * array is left justified
*/ */
break; continue;
}
if (NULL != jdata && jobs[j]->jobid == jdata->jobid) {
/* release this object, ensuring that the
* pointer array internal accounting
* is maintained!
*/
OBJ_RELEASE(jdata);
opal_pointer_array_set_item(orte_job_data, j, NULL);
continue;
} }
/* if the job is flagged to not be monitored, skip it */ /* if the job is flagged to not be monitored, skip it */
if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jobs[j]->controls) { if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jobs[j]->controls) {
@ -1315,20 +1359,33 @@ CHECK_ALL_JOBS:
/* when checking for job termination, we must be sure to NOT check /* when checking for job termination, we must be sure to NOT check
* our own job as it - rather obviously - has NOT terminated! * our own job as it - rather obviously - has NOT terminated!
*/ */
if (ORTE_PROC_MY_NAME->jobid != jobs[j]->jobid && if (jobs[j]->num_terminated < jobs[j]->num_procs) {
jobs[j]->num_terminated < jobs[j]->num_procs) { /* we have at least one job that is not done yet - we cannot
/* we have at least one job that is not done yet */ * just return, though, as we need to ensure we cleanout the
* job data for the job that just completed
*/
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed job %s is not terminated", "%s plm:base:check_job_completed job %s is not terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobs[j]->jobid))); ORTE_JOBID_PRINT(jobs[j]->jobid)));
return; one_still_alive = true;
} }
} }
/* if a job is still alive, we just return */
if (one_still_alive) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed at least one job is not terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return;
}
/* if we get here, then all jobs are done, so wakeup */ /* if we get here, then all jobs are done, so wakeup */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed all jobs terminated - waking up", "%s plm:base:check_job_completed all jobs terminated - waking up",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* set the exit status to 0 - this will only happen if it
* wasn't already set by an error condition
*/
ORTE_UPDATE_EXIT_STATUS(0);
orte_trigger_event(&orte_exit); orte_trigger_event(&orte_exit);
} }

Просмотреть файл

@ -52,7 +52,7 @@ typedef struct {
/* orted cmd cond */ /* orted cmd cond */
opal_condition_t orted_cmd_cond; opal_condition_t orted_cmd_cond;
/* next jobid */ /* next jobid */
orte_jobid_t next_jobid; uint16_t next_jobid;
/* time when daemons started launch */ /* time when daemons started launch */
struct timeval daemonlaunchstart; struct timeval daemonlaunchstart;
/* rsh launch agent path */ /* rsh launch agent path */

Просмотреть файл

@ -102,9 +102,10 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
/* /*
* Local variables * Local variables
*/ */
static pid_t srun_pid = 0; static pid_t primary_srun_pid = 0;
static bool primary_pid_set = false;
static orte_jobid_t active_job = ORTE_JOBID_INVALID; static orte_jobid_t active_job = ORTE_JOBID_INVALID;
static bool failed_launch; static bool launching_daemons;
static bool local_launch_available = false; static bool local_launch_available = false;
/** /**
@ -153,7 +154,8 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
struct timeval launchstart, launchstop; struct timeval launchstart, launchstop;
int proc_vpid_index; int proc_vpid_index;
orte_jobid_t failed_job; orte_jobid_t failed_job;
bool failed_launch=true;
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave, /* if this is a request to launch a local slave,
* then we will not be launching an orted - we will * then we will not be launching an orted - we will
@ -187,7 +189,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
} }
/* indicate the state of the launch */ /* indicate the state of the launch */
failed_launch = true; launching_daemons = true;
/* create a jobid for this job */ /* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) { if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
@ -398,6 +400,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
launch_apps: launch_apps:
/* get here if daemons launch okay - any failures now by apps */ /* get here if daemons launch okay - any failures now by apps */
launching_daemons = false;
failed_job = active_job; failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) { if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
@ -465,19 +468,31 @@ static int plm_slurm_terminate_job(orte_jobid_t jobid)
static int plm_slurm_terminate_orteds(void) static int plm_slurm_terminate_orteds(void)
{ {
int rc; int rc;
orte_job_t *jdata;
/* deregister the waitpid callback to ensure we don't make it look like /* tell them to die without sending a reply - we will rely on the
* srun failed when it didn't. Since the srun may have already completed, * waitpid to tell us when they have exited!
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
* messages
*/ */
orte_wait_cb_cancel(srun_pid); if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
} }
/* check to see if the primary pid is set. If not, this indicates
* that we never launched any additional daemons, so we cannot
* not wait for a waitpid to fire and tell us it's okay to
* exit. Instead, we simply trigger an exit for ourselves
*/
if (!primary_pid_set) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: primary daemons complete!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
jdata->state = ORTE_JOB_STATE_TERMINATED;
/* need to set the #terminated value to avoid an incorrect error msg */
jdata->num_terminated = jdata->num_procs;
orte_trigger_event(&orteds_exit);
}
return rc; return rc;
} }
@ -512,40 +527,63 @@ static int plm_slurm_finalize(void)
static void srun_wait_cb(pid_t pid, int status, void* cbdata){ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
orte_job_t *jdata;
/* According to the SLURM folks, srun always returns the highest exit /* According to the SLURM folks, srun always returns the highest exit
code of our remote processes. Thus, a non-zero exit status doesn't code of our remote processes. Thus, a non-zero exit status doesn't
necessarily mean that srun failed - it could be that an orted returned necessarily mean that srun failed - it could be that an orted returned
a non-zero exit status. Of course, that means the orted failed(!), so a non-zero exit status. Of course, that means the orted failed(!), so
the end result is the same - the job didn't start. the end result is the same - the job didn't start.
As a result, we really can't do much with the exit status itself - it
could be something in errno (if srun itself failed), or it could be
something returned by an orted, or it could be something returned by
the OS (e.g., couldn't find the orted binary). Somebody is welcome
to sort out all the options and pretty-print a better error message. For
now, though, the only thing that really matters is that
srun failed. Report the error and make sure that orterun
wakes up - otherwise, do nothing!
Unfortunately, the pid returned here is the srun pid, not the pid of As a result, we really can't do much with the exit status itself - it
the proc that actually died! So, to avoid confusion, just use -1 as the could be something in errno (if srun itself failed), or it could be
pid so nobody thinks this is real something returned by an orted, or it could be something returned by
*/ the OS (e.g., couldn't find the orted binary). Somebody is welcome
to sort out all the options and pretty-print a better error message. For
now, though, the only thing that really matters is that
srun failed. Report the error and make sure that orterun
wakes up - otherwise, do nothing!
Unfortunately, the pid returned here is the srun pid, not the pid of
the proc that actually died! So, to avoid confusion, just use -1 as the
pid so nobody thinks this is real
*/
if (0 != status) { /* if we are in the launch phase, then any termination is bad */
if (failed_launch) { if (launching_daemons) {
/* report that the daemon has failed so we can exit /* report that one or more daemons failed to launch so we can exit */
*/ OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START); "%s plm:slurm: daemon failed during launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else { orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
} else {
/* if this is after launch, then we need to abort only if the status
* returned is non-zero - i.e., if the orteds exited with an error
*/
if (0 != status) {
/* an orted must have died unexpectedly after launch - report /* an orted must have died unexpectedly after launch - report
* that the daemon has failed so we exit * that the daemon has failed so we exit
*/ */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: daemon failed while running",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED); orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED);
} }
/* otherwise, check to see if this is the primary pid */
if (primary_srun_pid == pid) {
/* in this case, we just want to fire the proper trigger so
* mpirun can exit
*/
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: primary daemons complete!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
jdata->state = ORTE_JOB_STATE_TERMINATED;
/* need to set the #terminated value to avoid an incorrect error msg */
jdata->num_terminated = jdata->num_procs;
orte_trigger_event(&orteds_exit);
}
} }
} }
@ -553,6 +591,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
char *prefix) char *prefix)
{ {
int fd; int fd;
int srun_pid;
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL); char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
if (NULL == exec_argv) { if (NULL == exec_argv) {
@ -653,6 +692,14 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
sides of the fork... */ sides of the fork... */
setpgid(srun_pid, srun_pid); setpgid(srun_pid, srun_pid);
/* if this is the primary launch - i.e., not a comm_spawn of a
* child job - then save the pid
*/
if (!primary_pid_set) {
primary_srun_pid = srun_pid;
primary_pid_set = true;
}
/* setup the waitpid so we can find out if srun succeeds! */ /* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL); orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv); free(exec_argv);

Просмотреть файл

@ -352,7 +352,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
int orte_rmaps_base_compute_usage(orte_job_t *jdata) int orte_rmaps_base_compute_usage(orte_job_t *jdata)
{ {
orte_std_cntr_t i; orte_std_cntr_t i;
orte_vpid_t j, k; int j, k;
orte_node_t **nodes; orte_node_t **nodes;
orte_proc_t **procs, *psave, *psave2; orte_proc_t **procs, *psave, *psave2;
orte_vpid_t minv, minv2; orte_vpid_t minv, minv2;
@ -378,13 +378,24 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
procs = (orte_proc_t**)nodes[i]->procs->addr; procs = (orte_proc_t**)nodes[i]->procs->addr;
local_rank = 0; local_rank = 0;
for (k=0; k < nodes[i]->num_procs; k++) { /* the node map may have holes in it, so cycle
* all the way through and avoid the holes
*/
for (k=0; k < nodes[i]->procs->size; k++) {
/* if this proc is NULL, skip it */
if (NULL == procs[k]) {
continue;
}
minv = ORTE_VPID_MAX; minv = ORTE_VPID_MAX;
minv2 = ORTE_VPID_MAX; minv2 = ORTE_VPID_MAX;
psave = NULL; psave = NULL;
psave2 = NULL; psave2 = NULL;
/* find the minimum vpid proc */ /* find the minimum vpid proc */
for (j=0; j < nodes[i]->num_procs; j++) { for (j=0; j < nodes[i]->procs->size; j++) {
/* if this proc is NULL, skip it */
if (NULL == procs[j]) {
continue;
}
if (procs[j]->name.jobid == jdata->jobid && if (procs[j]->name.jobid == jdata->jobid &&
ORTE_LOCAL_RANK_MAX == procs[j]->local_rank && ORTE_LOCAL_RANK_MAX == procs[j]->local_rank &&
procs[j]->name.vpid < minv) { procs[j]->name.vpid < minv) {

Просмотреть файл

@ -407,31 +407,6 @@ int orte_dt_init(void)
} }
#if !ORTE_DISABLE_FULL_SUPPORT #if !ORTE_DISABLE_FULL_SUPPORT
int orte_hnp_globals_init(void)
{
int rc;
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (rc = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
ORTE_ERROR_LOG(rc);
return rc;
}
orte_node_pool = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (rc = opal_pointer_array_init(orte_node_pool,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
orte_job_t* orte_get_job_data_object(orte_jobid_t job) orte_job_t* orte_get_job_data_object(orte_jobid_t job)
{ {
@ -575,20 +550,22 @@ static void orte_job_construct(orte_job_t* job)
static void orte_job_destruct(orte_job_t* job) static void orte_job_destruct(orte_job_t* job)
{ {
orte_std_cntr_t i; orte_std_cntr_t i;
orte_vpid_t j; int n;
for (i=0; i < job->num_apps; i++) { for (i=0; i < job->num_apps; i++) {
if (NULL != job->apps->addr[i]) OBJ_RELEASE(job->apps->addr[i]); if (NULL != job->apps->addr[i]) OBJ_RELEASE(job->apps->addr[i]);
} }
OBJ_RELEASE(job->apps); OBJ_RELEASE(job->apps);
for (j=0; j < job->num_procs; j++) { if (NULL != job->map) OBJ_RELEASE(job->map);
if (NULL != job->procs->addr[j]) OBJ_RELEASE(job->procs->addr[j]);
for (n=0; n < job->procs->size; n++) {
if (NULL != job->procs->addr[n]) {
OBJ_RELEASE(job->procs->addr[n]);
}
} }
OBJ_RELEASE(job->procs); OBJ_RELEASE(job->procs);
if (NULL != job->map) OBJ_RELEASE(job->map);
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
if (NULL != job->ckpt_snapshot_ref) { if (NULL != job->ckpt_snapshot_ref) {
free(job->ckpt_snapshot_ref); free(job->ckpt_snapshot_ref);
@ -635,7 +612,7 @@ static void orte_node_construct(orte_node_t* node)
static void orte_node_destruct(orte_node_t* node) static void orte_node_destruct(orte_node_t* node)
{ {
orte_vpid_t i; int i;
if (NULL != node->name) { if (NULL != node->name) {
free(node->name); free(node->name);
@ -645,10 +622,16 @@ static void orte_node_destruct(orte_node_t* node)
opal_argv_free(node->alias); opal_argv_free(node->alias);
} }
if (NULL != node->daemon) OBJ_RELEASE(node->daemon); if (NULL != node->daemon) {
node->daemon->node = NULL;
OBJ_RELEASE(node->daemon);
}
for (i=0; i < node->num_procs; i++) { for (i=0; i < node->procs->size; i++) {
if (NULL != node->procs->addr[i]) OBJ_RELEASE(node->procs->addr[i]); if (NULL != node->procs->addr[i]) {
((orte_proc_t*)(node->procs->addr[i]))->node = NULL;
OBJ_RELEASE(node->procs->addr[i]);
}
} }
OBJ_RELEASE(node->procs); OBJ_RELEASE(node->procs);

Просмотреть файл

@ -29,16 +29,6 @@
BEGIN_C_DECLS BEGIN_C_DECLS
#if !ORTE_DISABLE_FULL_SUPPORT
/**
* Initialize global storage for HNPs
*/
ORTE_DECLSPEC int orte_hnp_globals_init(void);
#endif /* !ORTE_DISABLE_FULL_SUPPORT */
/** /**
* Init the ORTE datatype support * Init the ORTE datatype support
*/ */

Просмотреть файл

@ -787,7 +787,6 @@ DONE:
static void job_completed(int trigpipe, short event, void *arg) static void job_completed(int trigpipe, short event, void *arg)
{ {
int rc; int rc;
orte_job_state_t exit_state;
orte_job_t *daemons; orte_job_t *daemons;
/* if the abort exit event is set, delete it */ /* if the abort exit event is set, delete it */
@ -796,8 +795,6 @@ static void job_completed(int trigpipe, short event, void *arg)
free(abort_exit_event); free(abort_exit_event);
} }
exit_state = jdata->state;
/* if we never launched, just skip this part to avoid /* if we never launched, just skip this part to avoid
* meaningless error messages * meaningless error messages
*/ */
@ -806,7 +803,7 @@ static void job_completed(int trigpipe, short event, void *arg)
goto DONE; goto DONE;
} }
if (ORTE_JOB_STATE_TERMINATED != exit_state) { if (0 != orte_exit_status) {
/* abnormal termination of some kind */ /* abnormal termination of some kind */
dump_aborted_procs(); dump_aborted_procs();
/* If we showed more abort messages than were allowed, /* If we showed more abort messages than were allowed,
@ -991,8 +988,8 @@ static void dump_aborted_procs(void)
jobs = (orte_job_t**)orte_job_data->addr; jobs = (orte_job_t**)orte_job_data->addr;
for (n=1; n < orte_job_data->size; n++) { for (n=1; n < orte_job_data->size; n++) {
if (NULL == jobs[n]) { if (NULL == jobs[n]) {
/* the array is left-justified, so we can quit on the first NULL */ /* the array is no longer left-justified, so we have to continue */
return; continue;
} }
if (ORTE_JOB_STATE_UNDEF != jobs[n]->state && if (ORTE_JOB_STATE_UNDEF != jobs[n]->state &&
ORTE_JOB_STATE_INIT != jobs[n]->state && ORTE_JOB_STATE_INIT != jobs[n]->state &&

Просмотреть файл

@ -828,7 +828,14 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
jobs = (orte_job_t**)orte_job_data->addr; jobs = (orte_job_t**)orte_job_data->addr;
/* for each job... */ /* for each job... */
for (j=0; j < orte_job_data->size && NULL != jobs[j]; j++) { for (j=1; j < orte_job_data->size; j++) {
/* the job array is no longer left-justified and may
* have holes in it as we recover resources at job
* completion
*/
if (NULL == jobs[j]) {
continue;
}
jdata = jobs[j]; jdata = jobs[j];
/* pack the jobid */ /* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {