SNAPC: use ORTE_WAIT_FOR_COMPLETION with non-blocking receives
During the commits to make the C/R code compile again the blocking receive calls in snapc_full_app.c were replaced by non-blocking receive calls. This commit adds ORTE_WAIT_FOR_COMPLETION() after each non-blocking receive to wait for the data. This commit was SVN r30487.
Этот коммит содержится в:
родитель
fa1036f38c
Коммит
5f95db3902
@ -99,12 +99,6 @@ static int current_cr_state = OPAL_CRS_NONE;
|
|||||||
static orte_sstore_base_handle_t current_ss_handle = ORTE_SSTORE_HANDLE_INVALID, last_ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
static orte_sstore_base_handle_t current_ss_handle = ORTE_SSTORE_HANDLE_INVALID, last_ss_handle = ORTE_SSTORE_HANDLE_INVALID;
|
||||||
static opal_crs_base_ckpt_options_t *current_options = NULL;
|
static opal_crs_base_ckpt_options_t *current_options = NULL;
|
||||||
|
|
||||||
static void snapc_full_app_callback_recv(int status,
|
|
||||||
orte_process_name_t* sender,
|
|
||||||
opal_buffer_t* buffer,
|
|
||||||
orte_rml_tag_t tag,
|
|
||||||
void* cbdata);
|
|
||||||
|
|
||||||
/************************
|
/************************
|
||||||
* Function Definitions
|
* Function Definitions
|
||||||
************************/
|
************************/
|
||||||
@ -224,6 +218,7 @@ int app_coord_finalize()
|
|||||||
opal_buffer_t *buffer = NULL;
|
opal_buffer_t *buffer = NULL;
|
||||||
orte_std_cntr_t count;
|
orte_std_cntr_t count;
|
||||||
orte_grpcomm_collective_t *coll;
|
orte_grpcomm_collective_t *coll;
|
||||||
|
orte_rml_recv_cb_t *rb = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* All processes must sync here, so the Global coordinator can know that
|
* All processes must sync here, so the Global coordinator can know that
|
||||||
@ -281,6 +276,9 @@ int app_coord_finalize()
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* buffer should not be released here; the callback releases it */
|
||||||
|
buffer = NULL;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||||
"app) Shutdown Barrier: Waiting on FIN_ACK...!"));
|
"app) Shutdown Barrier: Waiting on FIN_ACK...!"));
|
||||||
|
|
||||||
@ -288,28 +286,20 @@ int app_coord_finalize()
|
|||||||
* We could have been checkpointing just as we entered finalize, so we
|
* We could have been checkpointing just as we entered finalize, so we
|
||||||
* need to wait until the checkpoint is finished before finishing.
|
* need to wait until the checkpoint is finished before finishing.
|
||||||
*/
|
*/
|
||||||
buffer = OBJ_NEW(opal_buffer_t);
|
rb = OBJ_NEW(orte_rml_recv_cb_t);
|
||||||
#ifdef ENABLE_FT_FIXED
|
rb->active = true;
|
||||||
/* This is the old, now broken code */
|
orte_rml.recv_buffer_nb(ORTE_PROC_MY_HNP, ORTE_RML_TAG_SNAPC_FULL, 0, orte_rml_recv_callback, rb);
|
||||||
if (0 > (ret = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0))) {
|
ORTE_WAIT_FOR_COMPLETION(rb->active);
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
exit_status = ret;
|
|
||||||
OBJ_DESTRUCT(&buffer);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#endif /* ENABLE_FT_FIXED */
|
|
||||||
orte_rml.recv_buffer_nb(ORTE_PROC_MY_HNP, ORTE_RML_TAG_SNAPC_FULL, 0, snapc_full_app_callback_recv, buffer);
|
|
||||||
|
|
||||||
/* wait for completion */
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_FULL_CMD))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &command, &count, ORTE_SNAPC_FULL_CMD))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &op_event, &count, OPAL_INT))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &op_event, &count, OPAL_INT))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
@ -337,6 +327,10 @@ int app_coord_finalize()
|
|||||||
OBJ_RELEASE(buffer);
|
OBJ_RELEASE(buffer);
|
||||||
buffer = NULL;
|
buffer = NULL;
|
||||||
}
|
}
|
||||||
|
if (NULL != rb) {
|
||||||
|
OBJ_RELEASE(rb);
|
||||||
|
rb = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
OBJ_RELEASE(coll);
|
OBJ_RELEASE(coll);
|
||||||
|
|
||||||
@ -1301,6 +1295,7 @@ int app_coord_request_op(orte_snapc_base_request_op_t *datum)
|
|||||||
orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_REQUEST_OP_CMD;
|
orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_REQUEST_OP_CMD;
|
||||||
opal_buffer_t *buffer = NULL;
|
opal_buffer_t *buffer = NULL;
|
||||||
orte_std_cntr_t count;
|
orte_std_cntr_t count;
|
||||||
|
orte_rml_recv_cb_t *rb = NULL;
|
||||||
int op_event, op_state;
|
int op_event, op_state;
|
||||||
char *seq_str = NULL, *tmp_str = NULL;
|
char *seq_str = NULL, *tmp_str = NULL;
|
||||||
int cr_state = OPAL_CRS_CONTINUE;
|
int cr_state = OPAL_CRS_CONTINUE;
|
||||||
@ -1535,39 +1530,30 @@ int app_coord_request_op(orte_snapc_base_request_op_t *datum)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer = OBJ_NEW(opal_buffer_t);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Wait for a response regarding completion
|
* Wait for a response regarding completion
|
||||||
*/
|
*/
|
||||||
#ifdef ENABLE_FT_FIXED
|
rb = OBJ_NEW(orte_rml_recv_cb_t);
|
||||||
/* This is the old, now broken code */
|
rb->active = true;
|
||||||
if (0 > (ret = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0))) {
|
orte_rml.recv_buffer_nb(ORTE_PROC_MY_HNP, ORTE_RML_TAG_SNAPC_FULL, 0, orte_rml_recv_callback, rb);
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_WAIT_FOR_COMPLETION(rb->active);
|
||||||
exit_status = ret;
|
|
||||||
OBJ_DESTRUCT(&buffer);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#endif /* ENABLE_FT_FIXED */
|
|
||||||
orte_rml.recv_buffer_nb(ORTE_PROC_MY_HNP, ORTE_RML_TAG_SNAPC_FULL, 0, snapc_full_app_callback_recv, buffer);
|
|
||||||
/* wait for completion */
|
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_FULL_CMD))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &command, &count, ORTE_SNAPC_FULL_CMD))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &op_event, &count, OPAL_INT))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &op_event, &count, OPAL_INT))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &op_state, &count, OPAL_INT))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &op_state, &count, OPAL_INT))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
@ -1608,39 +1594,31 @@ int app_coord_request_op(orte_snapc_base_request_op_t *datum)
|
|||||||
"App) Request_op: Leader waiting for Migrate release (%3d)...",
|
"App) Request_op: Leader waiting for Migrate release (%3d)...",
|
||||||
datum->event));
|
datum->event));
|
||||||
|
|
||||||
buffer = OBJ_NEW(opal_buffer_t);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Wait for a response regarding completion
|
* Wait for a response regarding completion
|
||||||
*/
|
*/
|
||||||
#ifdef ENABLE_FT_FIXED
|
rb = OBJ_NEW(orte_rml_recv_cb_t);
|
||||||
/* This is the old, now broken code */
|
rb->active = true;
|
||||||
if (0 > (ret = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_SNAPC_FULL, 0))) {
|
orte_rml.recv_buffer_nb(ORTE_PROC_MY_HNP, ORTE_RML_TAG_SNAPC_FULL, 0, orte_rml_recv_callback, rb);
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_WAIT_FOR_COMPLETION(rb->active);
|
||||||
exit_status = ret;
|
|
||||||
OBJ_DESTRUCT(&buffer);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
#endif /* ENABLE_FT_FIXED */
|
|
||||||
orte_rml.recv_buffer_nb(ORTE_PROC_MY_HNP, ORTE_RML_TAG_SNAPC_FULL, 0, snapc_full_app_callback_recv, buffer);
|
|
||||||
/* wait for completion */
|
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SNAPC_FULL_CMD))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &command, &count, ORTE_SNAPC_FULL_CMD))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &op_event, &count, OPAL_INT))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &op_event, &count, OPAL_INT))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
count = 1;
|
count = 1;
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &op_state, &count, OPAL_INT))) {
|
if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &op_state, &count, OPAL_INT))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
@ -1683,6 +1661,10 @@ int app_coord_request_op(orte_snapc_base_request_op_t *datum)
|
|||||||
OBJ_RELEASE(buffer);
|
OBJ_RELEASE(buffer);
|
||||||
buffer = NULL;
|
buffer = NULL;
|
||||||
}
|
}
|
||||||
|
if (NULL != rb) {
|
||||||
|
OBJ_RELEASE(rb);
|
||||||
|
rb = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
if( NULL != seq_str ) {
|
if( NULL != seq_str ) {
|
||||||
free(seq_str);
|
free(seq_str);
|
||||||
@ -1696,12 +1678,3 @@ int app_coord_request_op(orte_snapc_base_request_op_t *datum)
|
|||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* dummy implementation of a callback function to get it to compile again */
|
|
||||||
static void snapc_full_app_callback_recv(int status,
|
|
||||||
orte_process_name_t* sender,
|
|
||||||
opal_buffer_t* buffer,
|
|
||||||
orte_rml_tag_t tag,
|
|
||||||
void* cbdata)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user