Detect the scenario where one or more procs fail to call orte/ompi_init while others in the job do. This scenario can cause the job to hang as MPI_Init contains a barrier operation that will not complete. Although ORTE does not contain such a barrier, it still will be considered as an error scenario so that we can detect the MPI case - otherwise, ORTE has no knowledge of OMPI and wouldn't know how to differentiate the use-cases.
Take advantage of the changes to update the routed_base_receive code to avoid message overlap. This commit was SVN r22329.
Этот коммит содержится в:
родитель
06d1f2cfe2
Коммит
8ab962411c
@ -2057,8 +2057,21 @@ static bool all_children_registered(orte_jobid_t job)
|
|||||||
|
|
||||||
/* is this child part of the specified job? */
|
/* is this child part of the specified job? */
|
||||||
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
|
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
|
||||||
|
/* if this child has terminated, we consider it as having
|
||||||
|
* registered for the purposes of this function. If it never
|
||||||
|
* did register, then we will send a NULL rml_uri back to
|
||||||
|
* the HNP, which will then know that the proc did not register.
|
||||||
|
* If other procs did register, then the HNP can declare an
|
||||||
|
* abnormal termination
|
||||||
|
*/
|
||||||
|
if (ORTE_PROC_STATE_UNTERMINATED < child->state) {
|
||||||
|
/* this proc has terminated somehow - consider it
|
||||||
|
* as registered for now
|
||||||
|
*/
|
||||||
|
continue;
|
||||||
|
}
|
||||||
/* if this child is *not* registered yet, return false */
|
/* if this child is *not* registered yet, return false */
|
||||||
if (NULL == child->rml_uri) {
|
if (!child->init_recvd) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2084,6 +2097,11 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
|
|||||||
|
|
||||||
/* is this child part of the specified job? */
|
/* is this child part of the specified job? */
|
||||||
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
|
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
|
||||||
|
/* pack the child's vpid - must be done in case rml_uri is NULL */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
/* pack the contact info */
|
/* pack the contact info */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -2173,6 +2191,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
|||||||
int rc;
|
int rc;
|
||||||
bool found=false;
|
bool found=false;
|
||||||
int8_t flag;
|
int8_t flag;
|
||||||
|
orte_odls_job_t *jobdat, *jdat;
|
||||||
|
|
||||||
/* protect operations involving the global list of children */
|
/* protect operations involving the global list of children */
|
||||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||||
@ -2216,13 +2235,15 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
|||||||
/* if the contact info is already set, then we are "de-registering" the child
|
/* if the contact info is already set, then we are "de-registering" the child
|
||||||
* so free the info and set it to NULL
|
* so free the info and set it to NULL
|
||||||
*/
|
*/
|
||||||
if (NULL != child->rml_uri) {
|
if (child->init_recvd && NULL != child->rml_uri) {
|
||||||
free(child->rml_uri);
|
free(child->rml_uri);
|
||||||
child->rml_uri = NULL;
|
child->rml_uri = NULL;
|
||||||
|
child->fini_recvd = true;
|
||||||
} else {
|
} else {
|
||||||
/* if the contact info is not set, then we are registering the child so
|
/* if the contact info is not set, then we are registering the child so
|
||||||
* unpack the contact info from the buffer and store it
|
* unpack the contact info from the buffer and store it
|
||||||
*/
|
*/
|
||||||
|
child->init_recvd = true;
|
||||||
cnt = 1;
|
cnt = 1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(child->rml_uri), &cnt, OPAL_STRING))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(child->rml_uri), &cnt, OPAL_STRING))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -2233,13 +2254,14 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
|||||||
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
|
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
|
||||||
/* do they want the nidmap? */
|
/* do they want the nidmap? */
|
||||||
if (drop_nidmap) {
|
if (drop_nidmap) {
|
||||||
orte_odls_job_t *jobdat = NULL;
|
|
||||||
/* get the jobdata object */
|
/* get the jobdata object */
|
||||||
|
jobdat = NULL;
|
||||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||||
item != opal_list_get_end(&orte_local_jobdata);
|
item != opal_list_get_end(&orte_local_jobdata);
|
||||||
item = opal_list_get_next(item)) {
|
item = opal_list_get_next(item)) {
|
||||||
jobdat = (orte_odls_job_t*)item;
|
jdat = (orte_odls_job_t*)item;
|
||||||
if (jobdat->jobid == child->name->jobid) {
|
if (jdat->jobid == child->name->jobid) {
|
||||||
|
jobdat = jdat;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2247,6 +2269,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
|||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:sync nidmap requested for job %s",
|
"%s odls:sync nidmap requested for job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -2577,7 +2600,7 @@ GOTCHILD:
|
|||||||
|
|
||||||
void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||||
{
|
{
|
||||||
orte_odls_child_t *child;
|
orte_odls_child_t *child, *chd;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
char *job, *vpid, *abort_file;
|
char *job, *vpid, *abort_file;
|
||||||
struct stat buf;
|
struct stat buf;
|
||||||
@ -2677,21 +2700,47 @@ GOTCHILD:
|
|||||||
/* okay, it terminated normally - check to see if a sync was required and
|
/* okay, it terminated normally - check to see if a sync was required and
|
||||||
* if it was received
|
* if it was received
|
||||||
*/
|
*/
|
||||||
if (NULL != child->rml_uri) {
|
if (child->init_recvd) {
|
||||||
/* if this is set, then we required a sync and didn't get it, so this
|
if (!child->fini_recvd) {
|
||||||
|
/* we required a finalizing sync and didn't get it, so this
|
||||||
* is considered an abnormal termination and treated accordingly
|
* is considered an abnormal termination and treated accordingly
|
||||||
*/
|
*/
|
||||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:waitpid_fired child process %s terminated normally "
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
"but did not provide a required sync - it "
|
"but did not provide a required finalize sync - it "
|
||||||
"will be treated as an abnormal termination",
|
"will be treated as an abnormal termination",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
/* has any child in this job already registered? */
|
||||||
|
for (item = opal_list_get_first(&orte_local_children);
|
||||||
|
item != opal_list_get_end(&orte_local_children);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
chd = (orte_odls_child_t*)item;
|
||||||
|
|
||||||
|
if (chd->init_recvd) {
|
||||||
|
/* someone has registered, and we didn't before
|
||||||
|
* terminating - this is an abnormal termination
|
||||||
|
*/
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
|
"but did not provide a required init sync - it "
|
||||||
|
"will be treated as an abnormal termination",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* if no child has registered, then it is possible that
|
||||||
|
* none of them will. This is considered acceptable
|
||||||
|
*/
|
||||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2699,7 +2748,6 @@ GOTCHILD:
|
|||||||
"%s odls:waitpid_fired child process %s terminated normally",
|
"%s odls:waitpid_fired child process %s terminated normally",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* the process was terminated with a signal! That's definitely
|
/* the process was terminated with a signal! That's definitely
|
||||||
|
@ -85,6 +85,8 @@ static void orte_odls_child_constructor(orte_odls_child_t *ptr)
|
|||||||
*/
|
*/
|
||||||
ptr->state = ORTE_PROC_STATE_FAILED_TO_START;
|
ptr->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
ptr->exit_code = 0;
|
ptr->exit_code = 0;
|
||||||
|
ptr->init_recvd = false;
|
||||||
|
ptr->fini_recvd = false;
|
||||||
ptr->rml_uri = NULL;
|
ptr->rml_uri = NULL;
|
||||||
ptr->slot_list = NULL;
|
ptr->slot_list = NULL;
|
||||||
ptr->waitpid_recvd = false;
|
ptr->waitpid_recvd = false;
|
||||||
|
@ -91,6 +91,8 @@ typedef struct {
|
|||||||
bool coll_recvd; /* collective operation recvd */
|
bool coll_recvd; /* collective operation recvd */
|
||||||
orte_proc_state_t state; /* the state of the process */
|
orte_proc_state_t state; /* the state of the process */
|
||||||
orte_exit_code_t exit_code; /* process exit code */
|
orte_exit_code_t exit_code; /* process exit code */
|
||||||
|
bool init_recvd; /* process called orte_init */
|
||||||
|
bool fini_recvd; /* process called orte_finalize */
|
||||||
char *rml_uri; /* contact info for this child */
|
char *rml_uri; /* contact info for this child */
|
||||||
char *slot_list; /* list of slots for this child */
|
char *slot_list; /* list of slots for this child */
|
||||||
bool waitpid_recvd; /* waitpid has detected proc termination */
|
bool waitpid_recvd; /* waitpid has detected proc termination */
|
||||||
|
@ -81,6 +81,7 @@ ORTE_DECLSPEC int orte_plm_base_close(void);
|
|||||||
*/
|
*/
|
||||||
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
|
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
|
||||||
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
|
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
|
||||||
|
ORTE_DECLSPEC void orte_plm_base_check_job_completed(orte_job_t *jdata);
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
|
@ -1369,10 +1369,8 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
|
|||||||
* that the user realizes there was an error, so in this -one- case,
|
* that the user realizes there was an error, so in this -one- case,
|
||||||
* we overwrite the process' exit code with the default error code
|
* we overwrite the process' exit code with the default error code
|
||||||
*/
|
*/
|
||||||
if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
|
|
||||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||||
/* we ordered this proc to die, so it isn't an abnormal termination
|
/* we ordered this proc to die, so it isn't an abnormal termination
|
||||||
|
@ -34,8 +34,8 @@
|
|||||||
|
|
||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
|
||||||
#include "opal/dss/dss.h"
|
#include "opal/dss/dss.h"
|
||||||
|
|
||||||
#include "orte/constants.h"
|
#include "orte/constants.h"
|
||||||
#include "orte/types.h"
|
#include "orte/types.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
@ -129,7 +129,7 @@ int orte_plm_base_comm_stop(void)
|
|||||||
|
|
||||||
|
|
||||||
/* process incoming messages in order of receipt */
|
/* process incoming messages in order of receipt */
|
||||||
void process_msg(int fd, short event, void *data)
|
static void process_msg(int fd, short event, void *data)
|
||||||
{
|
{
|
||||||
orte_msg_packet_t *msgpkt;
|
orte_msg_packet_t *msgpkt;
|
||||||
orte_plm_cmd_flag_t command;
|
orte_plm_cmd_flag_t command;
|
||||||
|
@ -111,8 +111,6 @@ ORTE_DECLSPEC int orte_plm_base_report_launched(orte_jobid_t job);
|
|||||||
|
|
||||||
ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons);
|
ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons);
|
||||||
|
|
||||||
ORTE_DECLSPEC void orte_plm_base_check_job_completed(orte_job_t *jdata);
|
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
|
ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
|
ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
|
||||||
|
@ -37,6 +37,8 @@ ORTE_DECLSPEC extern int orte_routed_base_output;
|
|||||||
ORTE_DECLSPEC extern opal_list_t orte_routed_base_components;
|
ORTE_DECLSPEC extern opal_list_t orte_routed_base_components;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern int orte_routed_base_register_sync(bool setup);
|
ORTE_DECLSPEC extern int orte_routed_base_register_sync(bool setup);
|
||||||
|
ORTE_DECLSPEC extern int orte_routed_base_process_callback(orte_jobid_t job,
|
||||||
|
opal_buffer_t *buffer);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_routed_base_comm_start(void);
|
ORTE_DECLSPEC int orte_routed_base_comm_start(void);
|
||||||
ORTE_DECLSPEC int orte_routed_base_comm_stop(void);
|
ORTE_DECLSPEC int orte_routed_base_comm_stop(void);
|
||||||
|
@ -53,6 +53,13 @@
|
|||||||
#include "orte/mca/routed/base/base.h"
|
#include "orte/mca/routed/base/base.h"
|
||||||
|
|
||||||
static bool recv_issued=false;
|
static bool recv_issued=false;
|
||||||
|
static opal_mutex_t lock;
|
||||||
|
static opal_list_t recvs;
|
||||||
|
static opal_event_t ready;
|
||||||
|
static int ready_fd[2];
|
||||||
|
static bool processing;
|
||||||
|
|
||||||
|
static void process_msg(int fd, short event, void *data);
|
||||||
|
|
||||||
int orte_routed_base_comm_start(void)
|
int orte_routed_base_comm_start(void)
|
||||||
{
|
{
|
||||||
@ -66,6 +73,20 @@ int orte_routed_base_comm_start(void)
|
|||||||
"%s routed:base: Receive: Start command recv",
|
"%s routed:base: Receive: Start command recv",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
|
processing = false;
|
||||||
|
OBJ_CONSTRUCT(&lock, opal_mutex_t);
|
||||||
|
OBJ_CONSTRUCT(&recvs, opal_list_t);
|
||||||
|
#ifndef __WINDOWS__
|
||||||
|
pipe(ready_fd);
|
||||||
|
#else
|
||||||
|
if (evutil_socketpair(AF_UNIX, SOCK_STREAM, 0, ready_fd) == -1) {
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
opal_event_set(&ready, ready_fd[0], OPAL_EV_READ, process_msg, NULL);
|
||||||
|
opal_event_add(&ready, 0);
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||||
ORTE_RML_TAG_INIT_ROUTES,
|
ORTE_RML_TAG_INIT_ROUTES,
|
||||||
ORTE_RML_NON_PERSISTENT,
|
ORTE_RML_NON_PERSISTENT,
|
||||||
@ -92,37 +113,73 @@ int orte_routed_base_comm_stop(void)
|
|||||||
"%s routed:base:receive stop comm",
|
"%s routed:base:receive stop comm",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_INIT_ROUTES))) {
|
OBJ_DESTRUCT(&recvs);
|
||||||
ORTE_ERROR_LOG(rc);
|
opal_event_del(&ready);
|
||||||
}
|
#ifndef __WINDOWS__
|
||||||
|
close(ready_fd[0]);
|
||||||
|
#else
|
||||||
|
closesocket(ready_fd[0]);
|
||||||
|
#endif
|
||||||
|
processing = false;
|
||||||
|
OBJ_DESTRUCT(&lock);
|
||||||
|
|
||||||
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_INIT_ROUTES);
|
||||||
recv_issued = false;
|
recv_issued = false;
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orte_routed_base_process_msg(int fd, short event, void *data)
|
static void process_msg(int fd, short event, void *data)
|
||||||
{
|
{
|
||||||
orte_message_event_t *mev = (orte_message_event_t*)data;
|
orte_msg_packet_t *msgpkt;
|
||||||
orte_jobid_t job;
|
orte_jobid_t job;
|
||||||
int rc;
|
int rc;
|
||||||
orte_std_cntr_t cnt;
|
orte_std_cntr_t cnt;
|
||||||
|
opal_list_item_t *item;
|
||||||
|
int dump[128];
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
|
||||||
|
"%s routed:base:receive processing msg",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
|
OPAL_THREAD_LOCK(&lock);
|
||||||
|
|
||||||
|
/* tag that we are processing the list */
|
||||||
|
processing = true;
|
||||||
|
|
||||||
|
/* clear the file descriptor to stop the event from refiring */
|
||||||
|
#ifndef __WINDOWS__
|
||||||
|
read(fd, &dump, sizeof(dump));
|
||||||
|
#else
|
||||||
|
recv(fd, (char *) &dump, sizeof(dump), 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
while (NULL != (item = opal_list_remove_first(&recvs))) {
|
||||||
|
msgpkt = (orte_msg_packet_t*)item;
|
||||||
|
|
||||||
/* unpack the jobid this is for */
|
/* unpack the jobid this is for */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(mev->buffer, &job, &cnt, ORTE_JOBID))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &cnt, ORTE_JOBID))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_RELEASE(mev);
|
OBJ_RELEASE(msgpkt);
|
||||||
return;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* pass the remainder of the buffer to the active module's
|
/* pass the remainder of the buffer to the active module's
|
||||||
* init_routes API
|
* init_routes API
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, mev->buffer))) {
|
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
OBJ_RELEASE(mev);
|
OBJ_RELEASE(msgpkt);
|
||||||
return;
|
}
|
||||||
|
|
||||||
|
/* reset the event */
|
||||||
|
processing = false;
|
||||||
|
opal_event_add(&ready, 0);
|
||||||
|
|
||||||
|
/* release the thread */
|
||||||
|
OPAL_THREAD_UNLOCK(&lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -151,7 +208,7 @@ void orte_routed_base_recv(int status, orte_process_name_t* sender,
|
|||||||
* buffer, however, is NOT released here, although its payload IS transferred
|
* buffer, however, is NOT released here, although its payload IS transferred
|
||||||
* to the message buffer for later processing
|
* to the message buffer for later processing
|
||||||
*/
|
*/
|
||||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, orte_routed_base_process_msg);
|
ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], true, sender, &buffer);
|
||||||
|
|
||||||
/* reissue the recv */
|
/* reissue the recv */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||||
@ -163,3 +220,12 @@ void orte_routed_base_recv(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* where HNP messages come */
|
||||||
|
void orte_routed_base_process_msg(int fd, short event, void *data)
|
||||||
|
{
|
||||||
|
orte_message_event_t *mev = (orte_message_event_t*)data;
|
||||||
|
|
||||||
|
ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], false, &mev->sender, &mev->buffer);
|
||||||
|
OBJ_RELEASE(mev);
|
||||||
|
}
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
#include "orte/mca/plm/base/base.h"
|
||||||
|
|
||||||
#include "orte/mca/routed/base/base.h"
|
#include "orte/mca/routed/base/base.h"
|
||||||
|
|
||||||
@ -105,3 +106,84 @@ int orte_routed_base_register_sync(bool setup)
|
|||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
||||||
|
{
|
||||||
|
orte_proc_t *proc;
|
||||||
|
orte_job_t *jdata;
|
||||||
|
orte_std_cntr_t cnt;
|
||||||
|
char *rml_uri;
|
||||||
|
orte_vpid_t vpid;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
/* lookup the job object for this process */
|
||||||
|
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* unpack the data for each entry */
|
||||||
|
cnt = 1;
|
||||||
|
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) {
|
||||||
|
|
||||||
|
cnt = 1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
continue;
|
||||||
|
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||||
|
"%s routed_binomial:callback got uri %s for job %s rank %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
(NULL == rml_uri) ? "NULL" : rml_uri,
|
||||||
|
ORTE_JOBID_PRINT(job), ORTE_VPID_PRINT(vpid)));
|
||||||
|
|
||||||
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rml_uri == NULL) {
|
||||||
|
/* if the rml_uri is NULL, then that means this process
|
||||||
|
* terminated without calling orte_init. However, the only
|
||||||
|
* reason we would be getting called here is if other
|
||||||
|
* processes local to that daemon -did- call orte_init.
|
||||||
|
* This is considered an "abnormal termination" mode per
|
||||||
|
* community discussion, and must generate a corresponding
|
||||||
|
* response, so declare the proc abnormally terminated
|
||||||
|
*/
|
||||||
|
proc->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
/* increment the number of procs that have terminated */
|
||||||
|
jdata->num_terminated++;
|
||||||
|
/* let the normal code path declare the job aborted */
|
||||||
|
orte_plm_base_check_job_completed(jdata);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* update the record */
|
||||||
|
proc->rml_uri = strdup(rml_uri);
|
||||||
|
free(rml_uri);
|
||||||
|
|
||||||
|
/* update the proc state */
|
||||||
|
if (proc->state < ORTE_PROC_STATE_RUNNING) {
|
||||||
|
proc->state = ORTE_PROC_STATE_RUNNING;
|
||||||
|
}
|
||||||
|
|
||||||
|
++jdata->num_reported;
|
||||||
|
cnt = 1;
|
||||||
|
}
|
||||||
|
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if all procs have reported, update our job state */
|
||||||
|
if (jdata->num_reported == jdata->num_procs) {
|
||||||
|
/* update the job state */
|
||||||
|
if (jdata->state < ORTE_JOB_STATE_RUNNING) {
|
||||||
|
jdata->state = ORTE_JOB_STATE_RUNNING;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
@ -421,71 +421,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
|||||||
return *ret;
|
return *ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
|
||||||
{
|
|
||||||
orte_proc_t **procs;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_std_cntr_t cnt;
|
|
||||||
char *rml_uri;
|
|
||||||
orte_process_name_t name;
|
|
||||||
int rc;
|
|
||||||
|
|
||||||
/* lookup the job object for this process */
|
|
||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
return ORTE_ERR_NOT_FOUND;
|
|
||||||
}
|
|
||||||
procs = (orte_proc_t**)jdata->procs->addr;
|
|
||||||
|
|
||||||
/* unpack the data for each entry */
|
|
||||||
cnt = 1;
|
|
||||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
|
||||||
"%s routed_binomial:callback got uri %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
(NULL == rml_uri) ? "NULL" : rml_uri));
|
|
||||||
|
|
||||||
if (rml_uri == NULL) continue;
|
|
||||||
|
|
||||||
/* we don't need to set the contact info into our rml
|
|
||||||
* hash table as we won't talk to the proc directly
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* extract the proc's name */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
free(rml_uri);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* the procs are stored in vpid order, so update the record */
|
|
||||||
procs[name.vpid]->rml_uri = strdup(rml_uri);
|
|
||||||
free(rml_uri);
|
|
||||||
|
|
||||||
/* update the proc state */
|
|
||||||
if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) {
|
|
||||||
procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
|
|
||||||
++jdata->num_reported;
|
|
||||||
cnt = 1;
|
|
||||||
}
|
|
||||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if all procs have reported, update our job state */
|
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
|
||||||
/* update the job state */
|
|
||||||
if (jdata->state < ORTE_JOB_STATE_RUNNING) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* HANDLE ACK MESSAGES FROM AN HNP */
|
/* HANDLE ACK MESSAGES FROM AN HNP */
|
||||||
static void release_ack(int fd, short event, void *data)
|
static void release_ack(int fd, short event, void *data)
|
||||||
{
|
{
|
||||||
@ -621,7 +556,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* if not, then I need to process the callback */
|
/* if not, then I need to process the callback */
|
||||||
if (ORTE_SUCCESS != (rc = process_callback(job, ndat))) {
|
if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -360,71 +360,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
|||||||
return *ret;
|
return *ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
|
||||||
{
|
|
||||||
orte_proc_t **procs;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_std_cntr_t cnt;
|
|
||||||
char *rml_uri;
|
|
||||||
orte_process_name_t name;
|
|
||||||
int rc;
|
|
||||||
|
|
||||||
/* lookup the job object for this process */
|
|
||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
return ORTE_ERR_NOT_FOUND;
|
|
||||||
}
|
|
||||||
procs = (orte_proc_t**)jdata->procs->addr;
|
|
||||||
|
|
||||||
/* unpack the data for each entry */
|
|
||||||
cnt = 1;
|
|
||||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
|
||||||
"%s routed_cm:callback got uri %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
(NULL == rml_uri) ? "NULL" : rml_uri));
|
|
||||||
|
|
||||||
if (rml_uri == NULL) continue;
|
|
||||||
|
|
||||||
/* we don't need to set the contact info into our rml
|
|
||||||
* hash table as we won't talk to the proc directly
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* extract the proc's name */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
free(rml_uri);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* the procs are stored in vpid order, so update the record */
|
|
||||||
procs[name.vpid]->rml_uri = strdup(rml_uri);
|
|
||||||
free(rml_uri);
|
|
||||||
|
|
||||||
/* update the proc state */
|
|
||||||
if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) {
|
|
||||||
procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
|
|
||||||
++jdata->num_reported;
|
|
||||||
cnt = 1;
|
|
||||||
}
|
|
||||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if all procs have reported, update our job state */
|
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
|
||||||
/* update the job state */
|
|
||||||
if (jdata->state < ORTE_JOB_STATE_RUNNING) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* HANDLE ACK MESSAGES FROM AN HNP */
|
/* HANDLE ACK MESSAGES FROM AN HNP */
|
||||||
static void release_ack(int fd, short event, void *data)
|
static void release_ack(int fd, short event, void *data)
|
||||||
{
|
{
|
||||||
@ -571,7 +506,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* if not, then I need to process the callback */
|
/* if not, then I need to process the callback */
|
||||||
if (ORTE_SUCCESS != (rc = process_callback(job, ndat))) {
|
if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -383,71 +383,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
|||||||
return *ret;
|
return *ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
|
||||||
{
|
|
||||||
orte_proc_t **procs;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_std_cntr_t cnt;
|
|
||||||
char *rml_uri;
|
|
||||||
orte_process_name_t name;
|
|
||||||
int rc;
|
|
||||||
|
|
||||||
/* lookup the job object for this process */
|
|
||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
return ORTE_ERR_NOT_FOUND;
|
|
||||||
}
|
|
||||||
procs = (orte_proc_t**)jdata->procs->addr;
|
|
||||||
|
|
||||||
/* unpack the data for each entry */
|
|
||||||
cnt = 1;
|
|
||||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
|
||||||
"%s routed_linear:callback got uri %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
(NULL == rml_uri) ? "NULL" : rml_uri));
|
|
||||||
|
|
||||||
if (rml_uri == NULL) continue;
|
|
||||||
|
|
||||||
/* we don't need to set the contact info into our rml
|
|
||||||
* hash table as we won't talk to the proc directly
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* extract the proc's name */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
free(rml_uri);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* the procs are stored in vpid order, so update the record */
|
|
||||||
procs[name.vpid]->rml_uri = strdup(rml_uri);
|
|
||||||
free(rml_uri);
|
|
||||||
|
|
||||||
/* update the proc state */
|
|
||||||
if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) {
|
|
||||||
procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
|
|
||||||
++jdata->num_reported;
|
|
||||||
cnt = 1;
|
|
||||||
}
|
|
||||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if all procs have reported, update our job state */
|
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
|
||||||
/* update the job state */
|
|
||||||
if (jdata->state < ORTE_JOB_STATE_RUNNING) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* HANDLE ACK MESSAGES FROM AN HNP */
|
/* HANDLE ACK MESSAGES FROM AN HNP */
|
||||||
static void release_ack(int fd, short event, void *data)
|
static void release_ack(int fd, short event, void *data)
|
||||||
{
|
{
|
||||||
@ -578,7 +513,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* if not, then I need to process the callback */
|
/* if not, then I need to process the callback */
|
||||||
if (ORTE_SUCCESS != (rc = process_callback(job, ndat))) {
|
if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -415,71 +415,6 @@ found:
|
|||||||
return *ret;
|
return *ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
|
||||||
{
|
|
||||||
orte_proc_t **procs;
|
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_std_cntr_t cnt;
|
|
||||||
char *rml_uri;
|
|
||||||
orte_process_name_t name;
|
|
||||||
int rc;
|
|
||||||
|
|
||||||
/* lookup the job object for this process */
|
|
||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
|
||||||
return ORTE_ERR_NOT_FOUND;
|
|
||||||
}
|
|
||||||
procs = (orte_proc_t**)jdata->procs->addr;
|
|
||||||
|
|
||||||
/* unpack the data for each entry */
|
|
||||||
cnt = 1;
|
|
||||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
|
||||||
"%s routed_radix:callback got uri %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
(NULL == rml_uri) ? "NULL" : rml_uri));
|
|
||||||
|
|
||||||
if (rml_uri == NULL) continue;
|
|
||||||
|
|
||||||
/* we don't need to set the contact info into our rml
|
|
||||||
* hash table as we won't talk to the proc directly
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* extract the proc's name */
|
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
free(rml_uri);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* the procs are stored in vpid order, so update the record */
|
|
||||||
procs[name.vpid]->rml_uri = strdup(rml_uri);
|
|
||||||
free(rml_uri);
|
|
||||||
|
|
||||||
/* update the proc state */
|
|
||||||
if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) {
|
|
||||||
procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
|
|
||||||
++jdata->num_reported;
|
|
||||||
cnt = 1;
|
|
||||||
}
|
|
||||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if all procs have reported, update our job state */
|
|
||||||
if (jdata->num_reported == jdata->num_procs) {
|
|
||||||
/* update the job state */
|
|
||||||
if (jdata->state < ORTE_JOB_STATE_RUNNING) {
|
|
||||||
jdata->state = ORTE_JOB_STATE_RUNNING;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* HANDLE ACK MESSAGES FROM AN HNP */
|
/* HANDLE ACK MESSAGES FROM AN HNP */
|
||||||
static void release_ack(int fd, short event, void *data)
|
static void release_ack(int fd, short event, void *data)
|
||||||
{
|
{
|
||||||
@ -610,7 +545,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* if not, then I need to process the callback */
|
/* if not, then I need to process the callback */
|
||||||
if (ORTE_SUCCESS != (rc = process_callback(job, ndat))) {
|
if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -95,8 +95,18 @@ in the application to be terminated by signals sent by %s
|
|||||||
#
|
#
|
||||||
[orterun:proc-exit-no-sync]
|
[orterun:proc-exit-no-sync]
|
||||||
%s has exited due to process rank %lu with PID %lu on
|
%s has exited due to process rank %lu with PID %lu on
|
||||||
node %s exiting without calling "finalize". This may
|
node %s exiting improperly. There are two reasons this could occur:
|
||||||
have caused other processes in the application to be
|
|
||||||
|
1. this process did not call "init" before exiting, but others in
|
||||||
|
the job did. This can cause a job to hang indefinitely while it waits
|
||||||
|
for all processes to call "init". By rule, if one process calls "init",
|
||||||
|
then ALL processes must call "init" prior to termination.
|
||||||
|
|
||||||
|
2. this process called "init", but exited without calling "finalize".
|
||||||
|
By rule, all processes that call "init" MUST call "finalize" prior to
|
||||||
|
exiting or it will be considered an "abnormal termination"
|
||||||
|
|
||||||
|
This may have caused other processes in the application to be
|
||||||
terminated by signals sent by %s (as reported here).
|
terminated by signals sent by %s (as reported here).
|
||||||
#
|
#
|
||||||
[orterun:proc-exit-no-sync-unknown]
|
[orterun:proc-exit-no-sync-unknown]
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user