
Manage errors in communicator creation (cid)

In order for this to work, error management also needs to be added to
NBC, in a separate PR.

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

The error field of requests needs to be rearmed at start, not at create

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
This commit is contained in:
Aurelien Bouteiller 2018-01-26 10:11:21 -05:00
parent 1b96be5f2f
commit 96c91e94eb
No key found matching this signature
GPG key ID: 08F60797C5941DB2
6 changed files with 62 additions and 29 deletions
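A hedged sketch of the error-handling pattern this commit applies (simplified from the ompi_comm_dup_with_info() and ompi_comm_create_group() hunks below, not a verbatim copy of the patch): cid allocation and activation now return an error code that callers check, releasing the partially constructed communicator before propagating the error.

    /* Sketch only: each construction step propagates its error code, and the
     * half-built communicator is released on failure instead of being leaked. */
    rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
    if (OMPI_SUCCESS != rc) {
        OBJ_RELEASE(newcomp);   /* drop the half-built communicator */
        return rc;              /* propagate the cid allocation error */
    }

    rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
    if (OMPI_SUCCESS != rc) {
        OBJ_RELEASE(newcomp);
        return rc;
    }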

View file

@@ -120,10 +120,10 @@ int ompi_comm_set ( ompi_communicator_t **ncomm,
     }
     if (NULL != req) {
-        ompi_request_wait( &req, MPI_STATUS_IGNORE);
+        rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
     }
-    return OMPI_SUCCESS;
+    return rc;
 }
 /*
@@ -1006,6 +1006,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
     /* Determine context id. It is identical to f_2_c_handle */
     rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
@@ -1022,6 +1023,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
@@ -1138,6 +1140,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
                                  NULL, false, mode, subreq);
     if (OMPI_SUCCESS != rc) {
         ompi_comm_request_return (request);
+        OBJ_RELEASE(context->newcomp);
         return rc;
     }
@@ -1166,6 +1169,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(context->newcomp);
         return rc;
     }
@@ -1208,6 +1212,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
     /* Determine context id. It is identical to f_2_c_handle */
     rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
@@ -1218,6 +1223,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
@@ -1517,16 +1523,16 @@ int ompi_comm_free( ompi_communicator_t **comm )
 /**********************************************************************/
 /**********************************************************************/
 /**********************************************************************/
-ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
+int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
                            ompi_communicator_t *bridge_comm,
                            int local_leader,
                            int remote_leader,
                            int tag,
-                           int rsize)
+                           int rsize,
+                           ompi_proc_t ***prprocs )
 {
     MPI_Request req;
-    int rc;
+    int rc = OMPI_SUCCESS;
     int local_rank, local_size;
     ompi_proc_t **rprocs=NULL;
     int32_t size_len;
@@ -1543,7 +1549,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     if (local_rank == local_leader) {
         sbuf = OBJ_NEW(opal_buffer_t);
         if (NULL == sbuf) {
-            rc = OMPI_ERROR;
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
             goto err_exit;
         }
         if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
@@ -1595,6 +1601,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     /* Allocate temporary buffer */
     recvbuf = (char *)malloc(rlen);
     if ( NULL == recvbuf ) {
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
         goto err_exit;
     }
@@ -1626,7 +1633,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     rbuf = OBJ_NEW(opal_buffer_t);
     if (NULL == rbuf) {
-        rc = OMPI_ERROR;
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
         goto err_exit;
     }
@ -1634,11 +1641,12 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
goto err_exit; goto err_exit;
} }
/* decode the names into a proc-list */ /* decode the names into a proc-list -- will never add a new proc
as the result of this operation, so no need to get the newprocs
list or call PML add_procs(). */
rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL); rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
OBJ_RELEASE(rbuf); OBJ_RELEASE(rbuf);
if (OMPI_SUCCESS != rc) { if (OMPI_SUCCESS != rc) {
OMPI_ERROR_LOG(rc);
goto err_exit; goto err_exit;
} }
@@ -1658,7 +1666,6 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     /* And now add the information into the database */
     if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
-        OMPI_ERROR_LOG(rc);
         goto err_exit;
     }
@@ -1666,6 +1673,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     /* rprocs isn't freed unless we have an error,
        since it is used in the communicator */
     if ( OMPI_SUCCESS != rc ) {
+        OMPI_ERROR_LOG(rc);
         opal_output(0, "%d: Error in ompi_get_rprocs\n", local_rank);
         if ( NULL != rprocs ) {
             free ( rprocs );
@@ -1686,7 +1694,8 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
         free ( sendbuf );
     }
-    return rprocs;
+    *prprocs = rprocs;
+    return rc;
 }
 /**********************************************************************/
 /**********************************************************************/

View file

@@ -370,6 +370,13 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
     int ret;
     int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
+    if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
+        if (participate) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+        }
+        return request->super.req_status.MPI_ERROR;
+    }
+
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
     }
@@ -407,11 +414,18 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
     int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
+    if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
+        if (participate) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextcid, NULL);
+        }
+        return request->super.req_status.MPI_ERROR;
+    }
+
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
     }
 
-    if (1 == context->rflag) {
+    if (0 != context->rflag) {
         if( !participate ) {
             /* we need to provide something sane here
              * but we cannot use `nextcid` as we may have it
@@ -442,7 +456,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
         return OMPI_SUCCESS;
     }
 
-    if (participate && (1 == context->flag)) {
+    if (participate && (0 != context->flag)) {
         /* we could use this cid, but other don't agree */
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
         context->start = context->nextcid + 1; /* that's where we can start the next round */

View file

@@ -119,6 +119,11 @@ static int ompi_comm_request_progress (void)
         while (request_item->subreq_count) {
             ompi_request_t *subreq = request_item->subreqs[request_item->subreq_count-1];
             if( REQUEST_COMPLETE(subreq) ) {
+                if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
+                    /* Let it continue but mark it as failed, so
+                     * that it does some subreqs cleanup */
+                    request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
+                }
                 ompi_request_free (&subreq);
                 request_item->subreq_count--;
             } else {
@@ -130,6 +135,8 @@ static int ompi_comm_request_progress (void)
         if (item_complete) {
             if (request_item->callback) {
                 opal_mutex_unlock (&ompi_comm_request_mutex);
+                /* the callback should check for errors in the request
+                 * status. */
                 rc = request_item->callback (request);
                 opal_mutex_lock (&ompi_comm_request_mutex);
             }
@@ -142,7 +149,7 @@ static int ompi_comm_request_progress (void)
         /* if the request schedule is empty then the request is complete */
         if (0 == opal_list_get_size (&request->schedule)) {
             opal_list_remove_item (&ompi_comm_requests_active, (opal_list_item_t *) request);
-            request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : MPI_ERR_INTERN;
+            request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
             ompi_request_complete (&request->super, true);
         }
     }
@@ -171,6 +178,7 @@ void ompi_comm_request_start (ompi_comm_request_t *request)
     }
 
     request->super.req_state = OMPI_REQUEST_ACTIVE;
+    request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
     opal_mutex_unlock (&ompi_comm_request_mutex);
 }

View file

@@ -649,12 +649,13 @@ OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
  * The routine makes sure, that all processes have afterwards
  * a list of ompi_proc_t pointers for the remote group.
  */
-struct ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
+int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
                            ompi_communicator_t *bridge_comm,
                            int local_leader,
                            int remote_leader,
                            int tag,
-                           int rsize);
+                           int rsize,
+                           struct ompi_proc_t ***prprocs );
 
 /**
  * This routine verifies, whether local_group and remote group are overlapping

View file

@@ -160,6 +160,7 @@ int ompi_errhandler_request_invoke(int count,
     /* Invoke the exception */
     switch (type) {
     case OMPI_REQUEST_PML:
+    case OMPI_REQUEST_COLL:
         return ompi_errhandler_invoke(mpi_object.comm->error_handler,
                                       mpi_object.comm,
                                       mpi_object.comm->errhandler_type,

View file

@@ -137,9 +137,9 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
             goto err_exit;
         }
 
-        rprocs = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
-                                       remote_leader, tag, rsize );
-        if ( NULL == rprocs ) {
+        rc = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
+                                   remote_leader, tag, rsize, &rprocs );
+        if ( OMPI_SUCCESS != rc ) {
             goto err_exit;
         }
@@ -222,7 +222,7 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
     }
 
     if ( OMPI_SUCCESS != rc ) {
         *newintercomm = MPI_COMM_NULL;
-        return OMPI_ERRHANDLER_INVOKE(local_comm, MPI_ERR_INTERN,
+        return OMPI_ERRHANDLER_INVOKE(local_comm, rc,
                                       FUNC_NAME);
     }