1
1

Enable spawn of local slave processes - plm module implementation to follow

This commit was SVN r20466.
Этот коммит содержится в:
Ralph Castain 2009-02-06 15:31:33 +00:00
родитель f8cd188367
Коммит 13749673ed
2 изменённых файла: 54 добавления и 45 удалений

Просмотреть файл

@@ -51,7 +51,7 @@ int orte_plm_proxy_spawn(orte_job_t *jdata)
/* setup the buffer */ /* setup the buffer */
OBJ_CONSTRUCT(&buf, opal_buffer_t); OBJ_CONSTRUCT(&buf, opal_buffer_t);
/* tell the HNP we are sending a launch request */ /* tell the recipient we are sending a launch request */
command = ORTE_PLM_LAUNCH_JOB_CMD; command = ORTE_PLM_LAUNCH_JOB_CMD;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@@ -67,10 +67,7 @@ int orte_plm_proxy_spawn(orte_job_t *jdata)
/* identify who gets this command - the HNP or the local orted */ /* identify who gets this command - the HNP or the local orted */
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* for now, this is unsupported */ target = ORTE_PROC_MY_DAEMON;
opal_output(0, "LOCAL DAEMON SPAWN IS CURRENTLY UNSUPPORTED");
target = ORTE_PROC_MY_HNP;
/* target = ORTE_PROC_MY_DAEMON; */
} else { } else {
target = ORTE_PROC_MY_HNP; target = ORTE_PROC_MY_HNP;
} }

Просмотреть файл

@@ -128,51 +128,63 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
OBJ_CONSTRUCT(&answer, opal_buffer_t); OBJ_CONSTRUCT(&answer, opal_buffer_t);
job = ORTE_JOBID_INVALID; job = ORTE_JOBID_INVALID;
/* get the job object */ /* unpack the job object */
count = 1; count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &jdata, &count, ORTE_JOB))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &jdata, &count, ORTE_JOB))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH; goto ANSWER_LAUNCH;
} }
/* get the parent's job object */ /* if this is a LOCAL slave cmd */
if (NULL == (parent = orte_get_job_data_object(mev->sender.jobid))) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); /* In this case, I cannot lookup job info. All I do is pass
goto ANSWER_LAUNCH; * this along to the local launcher
} */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH;
}
job = jdata->jobid;
} else { /* this is a GLOBAL launch cmd */
/* get the parent's job object */
if (NULL == (parent = orte_get_job_data_object(mev->sender.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto ANSWER_LAUNCH;
}
/* if the prefix was set in the parent's job, we need to transfer /* if the prefix was set in the parent's job, we need to transfer
* that prefix to the child's app_context so any further launch of * that prefix to the child's app_context so any further launch of
* orteds can find the correct binary. There always has to be at * orteds can find the correct binary. There always has to be at
* least one app_context in both parent and child, so we don't * least one app_context in both parent and child, so we don't
* need to check that here. However, be sure not to overwrite * need to check that here. However, be sure not to overwrite
* the prefix if the user already provided it! * the prefix if the user already provided it!
*/ */
apps = (orte_app_context_t**)parent->apps->addr; apps = (orte_app_context_t**)parent->apps->addr;
child_apps = (orte_app_context_t**)jdata->apps->addr; child_apps = (orte_app_context_t**)jdata->apps->addr;
if (NULL != apps[0]->prefix_dir && if (NULL != apps[0]->prefix_dir &&
NULL == child_apps[0]->prefix_dir) { NULL == child_apps[0]->prefix_dir) {
child_apps[0]->prefix_dir = strdup(apps[0]->prefix_dir); child_apps[0]->prefix_dir = strdup(apps[0]->prefix_dir);
} }
/* find the sender's node in the job map */ /* find the sender's node in the job map */
procs = (orte_proc_t**)parent->procs->addr; procs = (orte_proc_t**)parent->procs->addr;
/* set the bookmark so the child starts from that place - this means /* set the bookmark so the child starts from that place - this means
* that the first child process could be co-located with the proc * that the first child process could be co-located with the proc
* that called comm_spawn, assuming slots remain on that node. Otherwise, * that called comm_spawn, assuming slots remain on that node. Otherwise,
* the procs will start on the next available node * the procs will start on the next available node
*/ */
jdata->bookmark = procs[mev->sender.vpid]->node; jdata->bookmark = procs[mev->sender.vpid]->node;
/* launch it */ /* launch it */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH; goto ANSWER_LAUNCH;
} }
job = jdata->jobid; job = jdata->jobid;
/* return the favor so that any repetitive comm_spawns track each other */ /* return the favor so that any repetitive comm_spawns track each other */
parent->bookmark = jdata->bookmark; parent->bookmark = jdata->bookmark;
}
/* if the child is an ORTE job, wait for the procs to report they are alive */ /* if the child is an ORTE job, wait for the procs to report they are alive */
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) { if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {