Enable spawn of local slave processes - plm module implementation to follow
This commit was SVN r20466.
Этот коммит содержится в:
родитель
f8cd188367
Коммит
13749673ed
@ -51,7 +51,7 @@ int orte_plm_proxy_spawn(orte_job_t *jdata)
|
|||||||
/* setup the buffer */
|
/* setup the buffer */
|
||||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||||
|
|
||||||
/* tell the HNP we are sending a launch request */
|
/* tell the recipient we are sending a launch request */
|
||||||
command = ORTE_PLM_LAUNCH_JOB_CMD;
|
command = ORTE_PLM_LAUNCH_JOB_CMD;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -67,10 +67,7 @@ int orte_plm_proxy_spawn(orte_job_t *jdata)
|
|||||||
|
|
||||||
/* identify who gets this command - the HNP or the local orted */
|
/* identify who gets this command - the HNP or the local orted */
|
||||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
/* for now, this is unsupported */
|
target = ORTE_PROC_MY_DAEMON;
|
||||||
opal_output(0, "LOCAL DAEMON SPAWN IS CURRENTLY UNSUPPORTED");
|
|
||||||
target = ORTE_PROC_MY_HNP;
|
|
||||||
/* target = ORTE_PROC_MY_DAEMON; */
|
|
||||||
} else {
|
} else {
|
||||||
target = ORTE_PROC_MY_HNP;
|
target = ORTE_PROC_MY_HNP;
|
||||||
}
|
}
|
||||||
|
@ -128,51 +128,63 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
|
|||||||
OBJ_CONSTRUCT(&answer, opal_buffer_t);
|
OBJ_CONSTRUCT(&answer, opal_buffer_t);
|
||||||
job = ORTE_JOBID_INVALID;
|
job = ORTE_JOBID_INVALID;
|
||||||
|
|
||||||
/* get the job object */
|
/* unpack the job object */
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &jdata, &count, ORTE_JOB))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &jdata, &count, ORTE_JOB))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto ANSWER_LAUNCH;
|
goto ANSWER_LAUNCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get the parent's job object */
|
/* if this is a LOCAL slave cmd */
|
||||||
if (NULL == (parent = orte_get_job_data_object(mev->sender.jobid))) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
/* In this case, I cannot look up job info. All I do is pass
|
||||||
goto ANSWER_LAUNCH;
|
* this along to the local launcher
|
||||||
}
|
*/
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto ANSWER_LAUNCH;
|
||||||
|
}
|
||||||
|
job = jdata->jobid;
|
||||||
|
} else { /* this is a GLOBAL launch cmd */
|
||||||
|
/* get the parent's job object */
|
||||||
|
if (NULL == (parent = orte_get_job_data_object(mev->sender.jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
goto ANSWER_LAUNCH;
|
||||||
|
}
|
||||||
|
|
||||||
/* if the prefix was set in the parent's job, we need to transfer
|
/* if the prefix was set in the parent's job, we need to transfer
|
||||||
* that prefix to the child's app_context so any further launch of
|
* that prefix to the child's app_context so any further launch of
|
||||||
* orteds can find the correct binary. There always has to be at
|
* orteds can find the correct binary. There always has to be at
|
||||||
* least one app_context in both parent and child, so we don't
|
* least one app_context in both parent and child, so we don't
|
||||||
* need to check that here. However, be sure not to overwrite
|
* need to check that here. However, be sure not to overwrite
|
||||||
* the prefix if the user already provided it!
|
* the prefix if the user already provided it!
|
||||||
*/
|
*/
|
||||||
apps = (orte_app_context_t**)parent->apps->addr;
|
apps = (orte_app_context_t**)parent->apps->addr;
|
||||||
child_apps = (orte_app_context_t**)jdata->apps->addr;
|
child_apps = (orte_app_context_t**)jdata->apps->addr;
|
||||||
if (NULL != apps[0]->prefix_dir &&
|
if (NULL != apps[0]->prefix_dir &&
|
||||||
NULL == child_apps[0]->prefix_dir) {
|
NULL == child_apps[0]->prefix_dir) {
|
||||||
child_apps[0]->prefix_dir = strdup(apps[0]->prefix_dir);
|
child_apps[0]->prefix_dir = strdup(apps[0]->prefix_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* find the sender's node in the job map */
|
/* find the sender's node in the job map */
|
||||||
procs = (orte_proc_t**)parent->procs->addr;
|
procs = (orte_proc_t**)parent->procs->addr;
|
||||||
/* set the bookmark so the child starts from that place - this means
|
/* set the bookmark so the child starts from that place - this means
|
||||||
* that the first child process could be co-located with the proc
|
* that the first child process could be co-located with the proc
|
||||||
* that called comm_spawn, assuming slots remain on that node. Otherwise,
|
* that called comm_spawn, assuming slots remain on that node. Otherwise,
|
||||||
* the procs will start on the next available node
|
* the procs will start on the next available node
|
||||||
*/
|
*/
|
||||||
jdata->bookmark = procs[mev->sender.vpid]->node;
|
jdata->bookmark = procs[mev->sender.vpid]->node;
|
||||||
|
|
||||||
/* launch it */
|
/* launch it */
|
||||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto ANSWER_LAUNCH;
|
goto ANSWER_LAUNCH;
|
||||||
}
|
}
|
||||||
job = jdata->jobid;
|
job = jdata->jobid;
|
||||||
|
|
||||||
/* return the favor so that any repetitive comm_spawns track each other */
|
/* return the favor so that any repetitive comm_spawns track each other */
|
||||||
parent->bookmark = jdata->bookmark;
|
parent->bookmark = jdata->bookmark;
|
||||||
|
}
|
||||||
|
|
||||||
/* if the child is an ORTE job, wait for the procs to report they are alive */
|
/* if the child is an ORTE job, wait for the procs to report they are alive */
|
||||||
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
|
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user