Merge pull request #6087 from ICLDisco/export/errors_cid

Manage errors in communicator creations (cid)
This commit is contained in:
Aurelien Bouteiller 2018-12-31 15:01:55 -05:00 committed by GitHub
parents 17be4c6d1f 96c91e94eb
commit e54496bf2a
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 62 additions and 29 deletions

View File

@@ -121,10 +121,10 @@ int ompi_comm_set ( ompi_communicator_t **ncomm,
}
if (NULL != req) {
ompi_request_wait( &req, MPI_STATUS_IGNORE);
rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
}
return OMPI_SUCCESS;
return rc;
}
/*
@@ -1007,6 +1007,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
/* Determine context id. It is identical to f_2_c_handle */
rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1023,6 +1024,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
/* activate communicator and init coll-module */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1139,6 +1141,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
NULL, false, mode, subreq);
if (OMPI_SUCCESS != rc) {
ompi_comm_request_return (request);
OBJ_RELEASE(context->newcomp);
return rc;
}
@@ -1167,6 +1170,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
/* activate communicator and init coll-module */
rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(context->newcomp);
return rc;
}
@@ -1209,6 +1213,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
/* Determine context id. It is identical to f_2_c_handle */
rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
@@ -1219,6 +1224,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
/* activate communicator and init coll-module */
rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
if ( OMPI_SUCCESS != rc ) {
OBJ_RELEASE(newcomp);
return rc;
}
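
The creation paths above (ompi_comm_dup_with_info, the non-blocking idup callbacks, and ompi_comm_create_group) all gain the same fix: when cid selection or activation fails, the half-constructed communicator is released before the error code is propagated. A condensed sketch of the blocking case, abridged from the hunks above (allocation and the rest of the function are elided):

    rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
    if (OMPI_SUCCESS != rc) {
        OBJ_RELEASE(newcomp);   /* drop the partially built communicator */
        return rc;              /* hand the real error back to the caller */
    }

    /* ... intermediate setup ... */

    rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
    if (OMPI_SUCCESS != rc) {
        OBJ_RELEASE(newcomp);
        return rc;
    }

The non-blocking variants apply the same OBJ_RELEASE to context->newcomp inside their request callbacks.
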
@@ -1516,16 +1522,16 @@ int ompi_comm_free( ompi_communicator_t **comm )
/**********************************************************************/
/**********************************************************************/
/**********************************************************************/
ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize)
int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize,
ompi_proc_t ***prprocs )
{
MPI_Request req;
int rc;
int rc = OMPI_SUCCESS;
int local_rank, local_size;
ompi_proc_t **rprocs=NULL;
int32_t size_len;
@@ -1542,7 +1548,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
if (local_rank == local_leader) {
sbuf = OBJ_NEW(opal_buffer_t);
if (NULL == sbuf) {
rc = OMPI_ERROR;
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto err_exit;
}
if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
@@ -1594,6 +1600,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
/* Allocate temporary buffer */
recvbuf = (char *)malloc(rlen);
if ( NULL == recvbuf ) {
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto err_exit;
}
@@ -1625,7 +1632,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
rbuf = OBJ_NEW(opal_buffer_t);
if (NULL == rbuf) {
rc = OMPI_ERROR;
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto err_exit;
}
@@ -1633,11 +1640,12 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
goto err_exit;
}
/* decode the names into a proc-list */
/* decode the names into a proc-list -- will never add a new proc
as the result of this operation, so no need to get the newprocs
list or call PML add_procs(). */
rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
OBJ_RELEASE(rbuf);
if (OMPI_SUCCESS != rc) {
OMPI_ERROR_LOG(rc);
goto err_exit;
}
@@ -1657,7 +1665,6 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
/* And now add the information into the database */
if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
OMPI_ERROR_LOG(rc);
goto err_exit;
}
@@ -1665,6 +1672,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
/* rprocs isn't freed unless we have an error,
since it is used in the communicator */
if ( OMPI_SUCCESS != rc ) {
OMPI_ERROR_LOG(rc);
opal_output(0, "%d: Error in ompi_get_rprocs\n", local_rank);
if ( NULL != rprocs ) {
free ( rprocs );
@@ -1685,7 +1693,8 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
free ( sendbuf );
}
return rprocs;
*prprocs = rprocs;
return rc;
}
/**********************************************************************/
/**********************************************************************/
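
The signature change above turns ompi_comm_get_rprocs into an int-returning routine that hands the remote proc list back through the new prprocs output argument, so callers get the actual error code instead of a bare NULL on failure. A minimal caller sketch (it mirrors the updated call site in MPI_Intercomm_create shown at the end of this diff; variable setup is elided):

    ompi_proc_t **rprocs = NULL;
    int rc;

    rc = ompi_comm_get_rprocs (local_comm, bridge_comm, local_leader,
                               remote_leader, tag, rsize, &rprocs);
    if (OMPI_SUCCESS != rc) {
        /* rc carries the specific failure, e.g. OMPI_ERR_OUT_OF_RESOURCE,
         * rather than a generic error inferred from a NULL result */
        return rc;
    }
    /* on success, rprocs holds the rsize remote ompi_proc_t pointers */
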

View File

@@ -372,6 +372,13 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
int ret;
int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
if (participate) {
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
}
return request->super.req_status.MPI_ERROR;
}
if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
}
@@ -409,11 +416,18 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
if (participate) {
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextcid, NULL);
}
return request->super.req_status.MPI_ERROR;
}
if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
}
if (1 == context->rflag) {
if (0 != context->rflag) {
if( !participate ) {
/* we need to provide something sane here
* but we cannot use `nextcid` as we may have it
@@ -444,7 +458,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
return OMPI_SUCCESS;
}
if (participate && (1 == context->flag)) {
if (participate && (0 != context->flag)) {
/* we could use this cid, but other don't agree */
opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
context->start = context->nextcid + 1; /* that's where we can start the next round */
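
Both cid callbacks above (ompi_comm_checkcid and ompi_comm_nextcid_check_flag) now start by looking at the comm request status: if an earlier step of the non-blocking schedule failed, the provisionally reserved slot in ompi_mpi_communicators is cleared and the stored error is returned instead of continuing the agreement. The shape of that check, as a sketch abridged from ompi_comm_checkcid above:

    static int ompi_comm_checkcid (ompi_comm_request_t *request)
    {
        ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
        int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);

        if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
            if (participate) {
                /* roll back the cid slot reserved in an earlier step */
                opal_pointer_array_set_item (&ompi_mpi_communicators,
                                             context->nextlocal_cid, NULL);
            }
            return request->super.req_status.MPI_ERROR;
        }
        /* ... the normal cid agreement protocol continues here ... */
    }
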

View File

@@ -119,6 +119,11 @@ static int ompi_comm_request_progress (void)
while (request_item->subreq_count) {
ompi_request_t *subreq = request_item->subreqs[request_item->subreq_count-1];
if( REQUEST_COMPLETE(subreq) ) {
if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
/* Let it continue but mark it as failed, so
* that it does some subreqs cleanup */
request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
}
ompi_request_free (&subreq);
request_item->subreq_count--;
} else {
@@ -130,6 +135,8 @@ static int ompi_comm_request_progress (void)
if (item_complete) {
if (request_item->callback) {
opal_mutex_unlock (&ompi_comm_request_mutex);
/* the callback should check for errors in the request
* status. */
rc = request_item->callback (request);
opal_mutex_lock (&ompi_comm_request_mutex);
}
@@ -142,7 +149,7 @@ static int ompi_comm_request_progress (void)
/* if the request schedule is empty then the request is complete */
if (0 == opal_list_get_size (&request->schedule)) {
opal_list_remove_item (&ompi_comm_requests_active, (opal_list_item_t *) request);
request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : MPI_ERR_INTERN;
request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
ompi_request_complete (&request->super, true);
}
}
@@ -171,6 +178,7 @@ void ompi_comm_request_start (ompi_comm_request_t *request)
}
request->super.req_state = OMPI_REQUEST_ACTIVE;
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
opal_mutex_unlock (&ompi_comm_request_mutex);
}
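
The comm request progress loop is where failures enter that status: when a completed subrequest carries an error, the error is copied into the parent comm request before the schedule continues, the next callback can see it (as in comm_cid.c above), and the final status keeps the callback's return code instead of degrading it to MPI_ERR_INTERN. Condensed from the hunks above, the propagation chain looks roughly like:

    if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
        /* keep progressing, but remember the failure for later cleanup steps */
        request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
    }
    /* ... later, once the subrequests of this step are done ... */
    rc = request_item->callback (request);   /* callback checks the status itself */
    /* ... and when the schedule empties ... */
    request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
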

View File

@@ -649,12 +649,13 @@ OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
* The routine makes sure, that all processes have afterwards
* a list of ompi_proc_t pointers for the remote group.
*/
struct ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize);
int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
ompi_communicator_t *bridge_comm,
int local_leader,
int remote_leader,
int tag,
int rsize,
struct ompi_proc_t ***prprocs );
/**
* This routine verifies, whether local_group and remote group are overlapping

View File

@@ -160,6 +160,7 @@ int ompi_errhandler_request_invoke(int count,
/* Invoke the exception */
switch (type) {
case OMPI_REQUEST_PML:
case OMPI_REQUEST_COLL:
return ompi_errhandler_invoke(mpi_object.comm->error_handler,
mpi_object.comm,
mpi_object.comm->errhandler_type,

View File

@@ -137,9 +137,9 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
goto err_exit;
}
rprocs = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
remote_leader, tag, rsize );
if ( NULL == rprocs ) {
rc = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
remote_leader, tag, rsize, &rprocs );
if ( OMPI_SUCCESS != rc ) {
goto err_exit;
}
@@ -222,7 +222,7 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
}
if ( OMPI_SUCCESS != rc ) {
*newintercomm = MPI_COMM_NULL;
return OMPI_ERRHANDLER_INVOKE(local_comm, MPI_ERR_INTERN,
return OMPI_ERRHANDLER_INVOKE(local_comm, rc,
FUNC_NAME);
}