Remove a potential locking conflict, and let Emacs go ahead and reformat the function (sigh).
This commit was SVN r23331.
Этот коммит содержится в:
родитель
d463aec2f6
Коммит
f5548b8e0f
@ -184,171 +184,216 @@ static void process_msg(int fd, short event, void *data)
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
case ORTE_PLM_LAUNCH_JOB_CMD:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive job launch command",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
case ORTE_PLM_LAUNCH_JOB_CMD:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive job launch command",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* unpack the job object */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &jdata, &count, ORTE_JOB))) {
|
||||
/* unpack the job object */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &jdata, &count, ORTE_JOB))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
/* if is a LOCAL slave cmd */
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive local launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* In this case, I cannot lookup job info. All I do is pass
|
||||
* this along to the local launcher, IF it is available
|
||||
*/
|
||||
if (NULL == orte_plm.spawn) {
|
||||
/* can't do this operation */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
||||
rc = ORTE_ERR_NOT_SUPPORTED;
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
/* if is a LOCAL slave cmd */
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive local launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* In this case, I cannot lookup job info. All I do is pass
|
||||
* this along to the local launcher, IF it is available
|
||||
*/
|
||||
if (NULL == orte_plm.spawn) {
|
||||
/* can't do this operation */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
|
||||
rc = ORTE_ERR_NOT_SUPPORTED;
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
job = jdata->jobid;
|
||||
} else { /* this is a GLOBAL launch cmd */
|
||||
/* get the parent's job object */
|
||||
if (NULL == (parent = orte_get_job_data_object(msgpkt->sender.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
/* if the prefix was set in the parent's job, we need to transfer
|
||||
* that prefix to the child's app_context so any further launch of
|
||||
* orteds can find the correct binary. There always has to be at
|
||||
* least one app_context in both parent and child, so we don't
|
||||
* need to check that here. However, be sure not to overwrite
|
||||
* the prefix if the user already provided it!
|
||||
*/
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
|
||||
child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
|
||||
if (NULL != app->prefix_dir &&
|
||||
NULL == child_app->prefix_dir) {
|
||||
child_app->prefix_dir = strdup(app->prefix_dir);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive adding hosts",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* process any add-hostfile and add-host options that were provided */
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
if( NULL == parent->bookmark ) {
|
||||
/* find the sender's node in the job map */
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, msgpkt->sender.vpid))) {
|
||||
/* set the bookmark so the child starts from that place - this means
|
||||
* that the first child process could be co-located with the proc
|
||||
* that called comm_spawn, assuming slots remain on that node. Otherwise,
|
||||
* the procs will start on the next available node
|
||||
*/
|
||||
jdata->bookmark = proc->node;
|
||||
}
|
||||
} else {
|
||||
jdata->bookmark = parent->bookmark;
|
||||
}
|
||||
|
||||
/* launch it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive calling spawn",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
||||
|
||||
job = jdata->jobid;
|
||||
|
||||
/* output debugger proctable, if requested */
|
||||
if (orte_output_debugger_proctable) {
|
||||
char *output;
|
||||
opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "%s\n", output);
|
||||
fflush(orte_xml_fp);
|
||||
} else {
|
||||
opal_output(orte_clean_output, "%s", output);
|
||||
}
|
||||
free(output);
|
||||
}
|
||||
|
||||
/* return the favor so that any repetitive comm_spawns track each other */
|
||||
parent->bookmark = jdata->bookmark;
|
||||
job = jdata->jobid;
|
||||
} else { /* this is a GLOBAL launch cmd */
|
||||
/* get the parent's job object */
|
||||
if (NULL == (parent = orte_get_job_data_object(msgpkt->sender.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
/* if the child is an ORTE job, wait for the procs to report they are alive */
|
||||
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive waiting for procs to report",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
|
||||
/* we will wait here until the thread is released,
|
||||
* indicating that all procs have reported
|
||||
*/
|
||||
OPAL_ACQUIRE_THREAD(&jdata->reported_lock,
|
||||
&jdata->reported_cond,
|
||||
&jdata->not_reported);
|
||||
OPAL_THREAD_UNLOCK(&jdata->reported_lock);
|
||||
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
||||
|
||||
/* if the prefix was set in the parent's job, we need to transfer
|
||||
* that prefix to the child's app_context so any further launch of
|
||||
* orteds can find the correct binary. There always has to be at
|
||||
* least one app_context in both parent and child, so we don't
|
||||
* need to check that here. However, be sure not to overwrite
|
||||
* the prefix if the user already provided it!
|
||||
*/
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
|
||||
child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
|
||||
if (NULL != app->prefix_dir &&
|
||||
NULL == child_app->prefix_dir) {
|
||||
child_app->prefix_dir = strdup(app->prefix_dir);
|
||||
}
|
||||
|
||||
ANSWER_LAUNCH:
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive job %s launched",
|
||||
"%s plm:base:receive adding hosts",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* process any add-hostfile and add-host options that were provided */
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
if( NULL == parent->bookmark ) {
|
||||
/* find the sender's node in the job map */
|
||||
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, msgpkt->sender.vpid))) {
|
||||
/* set the bookmark so the child starts from that place - this means
|
||||
* that the first child process could be co-located with the proc
|
||||
* that called comm_spawn, assuming slots remain on that node. Otherwise,
|
||||
* the procs will start on the next available node
|
||||
*/
|
||||
jdata->bookmark = proc->node;
|
||||
}
|
||||
} else {
|
||||
jdata->bookmark = parent->bookmark;
|
||||
}
|
||||
|
||||
/* launch it */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive calling spawn",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
||||
|
||||
job = jdata->jobid;
|
||||
|
||||
/* output debugger proctable, if requested */
|
||||
if (orte_output_debugger_proctable) {
|
||||
char *output;
|
||||
opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
|
||||
if (orte_xml_output) {
|
||||
fprintf(orte_xml_fp, "%s\n", output);
|
||||
fflush(orte_xml_fp);
|
||||
} else {
|
||||
opal_output(orte_clean_output, "%s", output);
|
||||
}
|
||||
free(output);
|
||||
}
|
||||
|
||||
/* return the favor so that any repetitive comm_spawns track each other */
|
||||
parent->bookmark = jdata->bookmark;
|
||||
}
|
||||
|
||||
/* if the child is an ORTE job, wait for the procs to report they are alive */
|
||||
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive waiting for procs to report",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
|
||||
/* we will wait here until the thread is released,
|
||||
* indicating that all procs have reported
|
||||
*/
|
||||
OPAL_ACQUIRE_THREAD(&jdata->reported_lock,
|
||||
&jdata->reported_cond,
|
||||
&jdata->not_reported);
|
||||
OPAL_THREAD_UNLOCK(&jdata->reported_lock);
|
||||
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
||||
}
|
||||
|
||||
ANSWER_LAUNCH:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive job %s launched",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
|
||||
/* pack the jobid to be returned */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&answer, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
/* send the response back to the sender */
|
||||
if (0 > (ret = orte_rml.send_buffer(&msgpkt->sender, &answer, ORTE_RML_TAG_PLM_PROXY, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PLM_UPDATE_PROC_STATE:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive update proc state command from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(msgpkt->sender)) ));
|
||||
count = 1;
|
||||
running = false;
|
||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive got update_proc_state for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
|
||||
/* pack the jobid to be returned */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&answer, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
/* send the response back to the sender */
|
||||
if (0 > (ret = orte_rml.send_buffer(&msgpkt->sender, &answer, ORTE_RML_TAG_PLM_PROXY, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PLM_UPDATE_PROC_STATE:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive update proc state command from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(msgpkt->sender)) ));
|
||||
count = 1;
|
||||
|
||||
name.jobid = job;
|
||||
running = false;
|
||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive got update_proc_state for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job)));
|
||||
|
||||
name.jobid = job;
|
||||
running = false;
|
||||
/* get the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
/* get the job object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* if we are timing, the daemon will have included the time it
|
||||
* recvd the launch msg - the maximum time between when we sent
|
||||
* that message and a daemon recvd it tells us the time reqd
|
||||
* to wireup the daemon comm network
|
||||
*/
|
||||
if (orte_timing) {
|
||||
int64_t tmpsec, tmpusec;
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* if we are timing, the daemon will have included the time it
|
||||
* recvd the launch msg - the maximum time between when we sent
|
||||
* that message and a daemon recvd it tells us the time reqd
|
||||
* to wireup the daemon comm network
|
||||
*/
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* keep the maximum time */
|
||||
if (tmpsec > jdata->max_launch_msg_recvd.tv_sec) {
|
||||
jdata->max_launch_msg_recvd.tv_sec = tmpsec;
|
||||
jdata->max_launch_msg_recvd.tv_usec = tmpusec;
|
||||
} else if (tmpsec == jdata->max_launch_msg_recvd.tv_sec &&
|
||||
tmpusec > jdata->max_launch_msg_recvd.tv_usec) {
|
||||
jdata->max_launch_msg_recvd.tv_usec = tmpusec;
|
||||
}
|
||||
if (orte_timing_details) {
|
||||
int64_t sec, usec;
|
||||
char *timestr;
|
||||
ORTE_COMPUTE_TIME_DIFF(sec, usec, jdata->launch_msg_sent.tv_sec, jdata->launch_msg_sent.tv_usec,
|
||||
tmpsec, tmpusec);
|
||||
timestr = orte_pretty_print_timing(sec, usec);
|
||||
fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n",
|
||||
ORTE_VPID_PRINT(msgpkt->sender.vpid), timestr);
|
||||
free(timestr);
|
||||
}
|
||||
}
|
||||
count = 1;
|
||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID))) {
|
||||
if (ORTE_VPID_INVALID == vpid) {
|
||||
/* flag indicates that this job is complete - move on */
|
||||
break;
|
||||
}
|
||||
name.vpid = vpid;
|
||||
/* unpack the pid */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* if we are timing things, unpack the time this proc was started */
|
||||
if (orte_timing) {
|
||||
int64_t tmpsec, tmpusec;
|
||||
count = 1;
|
||||
@ -361,135 +406,92 @@ static void process_msg(int fd, short event, void *data)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* keep the maximum time */
|
||||
if (tmpsec > jdata->max_launch_msg_recvd.tv_sec) {
|
||||
jdata->max_launch_msg_recvd.tv_sec = tmpsec;
|
||||
jdata->max_launch_msg_recvd.tv_usec = tmpusec;
|
||||
} else if (tmpsec == jdata->max_launch_msg_recvd.tv_sec &&
|
||||
tmpusec > jdata->max_launch_msg_recvd.tv_usec) {
|
||||
jdata->max_launch_msg_recvd.tv_usec = tmpusec;
|
||||
}
|
||||
if (orte_timing_details) {
|
||||
int64_t sec, usec;
|
||||
char *timestr;
|
||||
ORTE_COMPUTE_TIME_DIFF(sec, usec, jdata->launch_msg_sent.tv_sec, jdata->launch_msg_sent.tv_usec,
|
||||
tmpsec, tmpusec);
|
||||
timestr = orte_pretty_print_timing(sec, usec);
|
||||
fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n",
|
||||
ORTE_VPID_PRINT(msgpkt->sender.vpid), timestr);
|
||||
free(timestr);
|
||||
time_t tmptime;
|
||||
char *tmpstr;
|
||||
tmptime = tmpsec;
|
||||
tmpstr = ctime(&tmptime);
|
||||
/* remove the newline and the year at the end */
|
||||
tmpstr[strlen(tmpstr)-6] = '\0';
|
||||
fprintf(orte_timing_output, "Time rank %s was launched: %s.%3lu\n",
|
||||
ORTE_VPID_PRINT(vpid), tmpstr, (unsigned long)(tmpusec/1000));
|
||||
}
|
||||
}
|
||||
/* unpack the state */
|
||||
count = 1;
|
||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID))) {
|
||||
if (ORTE_VPID_INVALID == vpid) {
|
||||
/* flag indicates that this job is complete - move on */
|
||||
break;
|
||||
}
|
||||
name.vpid = vpid;
|
||||
/* unpack the pid */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* if we are timing things, unpack the time this proc was started */
|
||||
if (orte_timing) {
|
||||
int64_t tmpsec, tmpusec;
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (orte_timing_details) {
|
||||
time_t tmptime;
|
||||
char *tmpstr;
|
||||
tmptime = tmpsec;
|
||||
tmpstr = ctime(&tmptime);
|
||||
/* remove the newline and the year at the end */
|
||||
tmpstr[strlen(tmpstr)-6] = '\0';
|
||||
fprintf(orte_timing_output, "Time rank %s was launched: %s.%3lu\n",
|
||||
ORTE_VPID_PRINT(vpid), tmpstr, (unsigned long)(tmpusec/1000));
|
||||
}
|
||||
}
|
||||
/* unpack the state */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &state, &count, ORTE_PROC_STATE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_PROC_STATE_RUNNING == state) {
|
||||
running = true;
|
||||
}
|
||||
/* unpack the exit code */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));
|
||||
|
||||
/* update the state */
|
||||
orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF,
|
||||
&name, state, pid, exit_code);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &state, &count, ORTE_PROC_STATE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_PROC_STATE_RUNNING == state) {
|
||||
running = true;
|
||||
}
|
||||
/* unpack the exit code */
|
||||
count = 1;
|
||||
}
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
jdata->num_daemons_reported++;
|
||||
if (orte_report_launch_progress && running) {
|
||||
if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
|
||||
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));
|
||||
|
||||
/* update the state */
|
||||
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
|
||||
orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF,
|
||||
&name, state, pid, exit_code);
|
||||
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
|
||||
}
|
||||
break;
|
||||
count = 1;
|
||||
}
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
} else {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
jdata->num_daemons_reported++;
|
||||
if (orte_report_launch_progress && running) {
|
||||
if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
|
||||
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PLM_INIT_ROUTES_CMD:
|
||||
case ORTE_PLM_INIT_ROUTES_CMD:
|
||||
count=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
name.jobid = job;
|
||||
count=1;
|
||||
while (ORTE_SUCCESS == opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID)) {
|
||||
if (ORTE_VPID_INVALID == vpid) {
|
||||
break;
|
||||
}
|
||||
name.vpid = vpid;
|
||||
/* update the errmgr state */
|
||||
orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED,
|
||||
&name, ORTE_PROC_STATE_REGISTERED,
|
||||
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
count=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
name.jobid = job;
|
||||
count=1;
|
||||
while (ORTE_SUCCESS == opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID)) {
|
||||
if (ORTE_VPID_INVALID == vpid) {
|
||||
break;
|
||||
}
|
||||
name.vpid = vpid;
|
||||
/* update the errmgr state */
|
||||
orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED,
|
||||
&name, ORTE_PROC_STATE_REGISTERED,
|
||||
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
count=1;
|
||||
}
|
||||
/* pass the remainder of the buffer to the active module's
|
||||
* init_routes API
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* pass the remainder of the buffer to the active module's
|
||||
* init_routes API
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
break;
|
||||
default:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
break;
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
@ -501,7 +503,7 @@ static void process_msg(int fd, short event, void *data)
|
||||
}
|
||||
}
|
||||
|
||||
DEPART:
|
||||
DEPART:
|
||||
/* release the thread */
|
||||
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user