
Manage errors in communicator creation (cid)

In order for this to work, error management also needs to be added to
NBC, in a separate PR

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

The error field of requests needs to be rearmed at start, not at create

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
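
For orientation before reading the hunks: the recurring pattern this commit applies at each creation step is condensed below from the ompi_comm_dup_with_info hunks in this diff. This is an illustrative excerpt only, not additional code in the patch; newcomp, comm, mode and rc are the locals of the surrounding function.

    /* Determine the context id (cid); on failure, release the
     * half-constructed communicator and propagate the error code. */
    rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
    if ( OMPI_SUCCESS != rc ) {
        OBJ_RELEASE(newcomp);
        return rc;
    }

    /* Activate the communicator and init the coll-module; same handling. */
    rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
    if ( OMPI_SUCCESS != rc ) {
        OBJ_RELEASE(newcomp);
        return rc;
    }
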
This commit is contained in:
Aurelien Bouteiller 2018-01-26 10:11:21 -05:00
parent 1b96be5f2f
commit 96c91e94eb
No known key found for this signature
GPG key ID: 08F60797C5941DB2
6 changed files with 62 additions and 29 deletions

View File

@@ -120,10 +120,10 @@ int ompi_comm_set ( ompi_communicator_t **ncomm,
}
if (NULL != req) {
ompi_request_wait( &req, MPI_STATUS_IGNORE);
rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
}
return OMPI_SUCCESS;
return rc;
}
/*
@@ -1006,6 +1006,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
/* Determine context id. It is identical to f_2_c_handle */
rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1022,6 +1023,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
/* activate communicator and init coll-module */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1138,6 +1140,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
NULL, false, mode, subreq);
if (OMPI_SUCCESS != rc) {
ompi_comm_request_return (request);
OBJ_RELEASE(context->newcomp);
return rc;
}
@@ -1166,6 +1169,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
/* activate communicator and init coll-module */
rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(context->newcomp);
return rc;
}
@@ -1208,6 +1212,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
/* Determine context id. It is identical to f_2_c_handle */
rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1218,6 +1223,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
/* activate communicator and init coll-module */
rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1517,16 +1523,16 @@ int ompi_comm_free( ompi_communicator_t **comm )
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize)
int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize,
ompi_proc_t ***prprocs )
{
MPI_Request req;
int rc;
int rc = OMPI_SUCCESS;
int local_rank, local_size;
ompi_proc_t **rprocs=NULL;
int32_t size_len;
@@ -1543,7 +1549,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
if (local_rank == local_leader) {
sbuf = OBJ_NEW(opal_buffer_t);
if (NULL == sbuf) {
rc = OMPI_ERROR;
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto err_exit;
}
if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
@@ -1595,6 +1601,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
/* Allocate temporary buffer */
recvbuf = (char *)malloc(rlen);
if ( NULL == recvbuf ) {
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto err_exit;
}
@@ -1626,7 +1633,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
rbuf = OBJ_NEW(opal_buffer_t);
if (NULL == rbuf) {
rc = OMPI_ERROR;
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto err_exit;
}
@@ -1634,11 +1641,12 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
goto err_exit;
}
/* decode the names into a proc-list */
/* decode the names into a proc-list -- will never add a new proc
as the result of this operation, so no need to get the newprocs
list or call PML add_procs(). */
rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
OBJ_RELEASE(rbuf);
if (OMPI_SUCCESS != rc) {
OMPI_ERROR_LOG(rc);
goto err_exit;
}
@@ -1658,7 +1666,6 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
/* And now add the information into the database */
if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
OMPI_ERROR_LOG(rc);
goto err_exit;
}
@@ -1666,6 +1673,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
/* rprocs isn't freed unless we have an error,
since it is used in the communicator */
if ( OMPI_SUCCESS != rc ) {
OMPI_ERROR_LOG(rc);
opal_output(0, "%d: Error in ompi_get_rprocs\n", local_rank);
if ( NULL != rprocs ) {
free ( rprocs );
@@ -1686,7 +1694,8 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
free ( sendbuf );
}
return rprocs;
*prprocs = rprocs;
return rc;
}
/**********************************************************************/
/**********************************************************************/

View File

@@ -370,6 +370,13 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
int ret;
int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
if (participate) {
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
}
return request->super.req_status.MPI_ERROR;
}
if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
}
@@ -407,11 +414,18 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
if (participate) {
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextcid, NULL);
}
return request->super.req_status.MPI_ERROR;
}
if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
}
if (1 == context->rflag) {
if (0 != context->rflag) {
if( !participate ) {
/* we need to provide something sane here
* but we cannot use `nextcid` as we may have it
@@ -442,7 +456,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
return OMPI_SUCCESS;
}
if (participate && (1 == context->flag)) {
if (participate && (0 != context->flag)) {
/* we could use this cid, but others don't agree */
opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
context->start = context->nextcid + 1; /* that's where we can start the next round */

View File

@@ -119,6 +119,11 @@ static int ompi_comm_request_progress (void)
while (request_item->subreq_count) {
ompi_request_t *subreq = request_item->subreqs[request_item->subreq_count-1];
if( REQUEST_COMPLETE(subreq) ) {
if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
/* Let it continue but mark it as failed, so
* that it does some subreqs cleanup */
request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
}
ompi_request_free (&subreq);
request_item->subreq_count--;
} else {
@@ -130,6 +135,8 @@ static int ompi_comm_request_progress (void)
if (item_complete) {
if (request_item->callback) {
opal_mutex_unlock (&ompi_comm_request_mutex);
/* the callback should check for errors in the request
* status. */
rc = request_item->callback (request);
opal_mutex_lock (&ompi_comm_request_mutex);
}
@@ -142,7 +149,7 @@ static int ompi_comm_request_progress (void)
/* if the request schedule is empty then the request is complete */
if (0 == opal_list_get_size (&request->schedule)) {
opal_list_remove_item (&ompi_comm_requests_active, (opal_list_item_t *) request);
request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : MPI_ERR_INTERN;
request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
ompi_request_complete (&request->super, true);
}
}
@@ -171,6 +178,7 @@ void ompi_comm_request_start (ompi_comm_request_t *request)
}
request->super.req_state = OMPI_REQUEST_ACTIVE;
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
opal_mutex_unlock (&ompi_comm_request_mutex);
}
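
The comment added in this file ("the callback should check for errors in the request status"), together with the early-return checks added to ompi_comm_checkcid and ompi_comm_nextcid_check_flag in the previous file, define the contract for comm-request callbacks. A minimal sketch of that shape follows; example_callback is a placeholder name, not an OMPI symbol.

    static int example_callback (ompi_comm_request_t *request)
    {
        /* An earlier subrequest failed and the progress loop copied its
         * error into the request status: clean up whatever this step owns
         * and propagate the error instead of continuing. */
        if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
            return request->super.req_status.MPI_ERROR;
        }

        /* ... otherwise perform or schedule the next creation step ... */
        return OMPI_SUCCESS;
    }
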

View File

@@ -649,12 +649,13 @@ OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
* The routine makes sure, that all processes have afterwards
* a list of ompi_proc_t pointers for the remote group.
*/
struct ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize);
int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize,
struct ompi_proc_t ***prprocs );
/**
* This routine verifies, whether local_group and remote group are overlapping

View File

@@ -160,6 +160,7 @@ int ompi_errhandler_request_invoke(int count,
/* Invoke the exception */
switch (type) {
case OMPI_REQUEST_PML:
case OMPI_REQUEST_COLL:
return ompi_errhandler_invoke(mpi_object.comm->error_handler,
mpi_object.comm,
mpi_object.comm->errhandler_type,

View File

@@ -137,9 +137,9 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
goto err_exit;
}
rprocs = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
remote_leader, tag, rsize );
if ( NULL == rprocs ) {
rc = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
remote_leader, tag, rsize, &rprocs );
if ( OMPI_SUCCESS != rc ) {
goto err_exit;
}
@@ -222,7 +222,7 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
}
if ( OMPI_SUCCESS != rc ) {
*newintercomm = MPI_COMM_NULL;
return OMPI_ERRHANDLER_INVOKE(local_comm, MPI_ERR_INTERN,
return OMPI_ERRHANDLER_INVOKE(local_comm, rc,
FUNC_NAME);
}