Merge pull request #6087 from ICLDisco/export/errors_cid
Manage errors in communicator creations (cid)
Commit e54496bf2a
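The common thread in this change set is that failures during communicator creation (context-id agreement, activation, remote-proc exchange) are now handed back to the caller as their original error codes instead of being dropped or collapsed into MPI_ERR_INTERN. A small, self-contained user-level sketch of the intended contract (illustration only, not part of the patch; under normal conditions the duplication simply succeeds):

    #include <mpi.h>
    #include <stdio.h>

    int main (int argc, char *argv[])
    {
        MPI_Comm dup;
        int rc;

        MPI_Init (&argc, &argv);
        /* With MPI_ERRORS_RETURN installed, a failure inside the duplication
         * path (cid allocation, activation, ...) is reported through rc. */
        MPI_Comm_set_errhandler (MPI_COMM_WORLD, MPI_ERRORS_RETURN);

        rc = MPI_Comm_dup (MPI_COMM_WORLD, &dup);
        if (MPI_SUCCESS != rc) {
            char msg[MPI_MAX_ERROR_STRING];
            int len;
            MPI_Error_string (rc, msg, &len);
            fprintf (stderr, "MPI_Comm_dup failed: %s\n", msg);
        } else {
            MPI_Comm_free (&dup);
        }

        MPI_Finalize ();
        return 0;
    }

Built against any MPI implementation and launched with e.g. mpirun -np 4, it prints nothing unless the duplication fails.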
@@ -121,10 +121,10 @@ int ompi_comm_set ( ompi_communicator_t **ncomm,
     }
 
     if (NULL != req) {
-        ompi_request_wait( &req, MPI_STATUS_IGNORE);
+        rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
     }
 
-    return OMPI_SUCCESS;
+    return rc;
 }
 
 /*
@@ -1007,6 +1007,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
     /* Determine context id. It is identical to f_2_c_handle */
     rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1023,6 +1024,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1139,6 +1141,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
                                    NULL, false, mode, subreq);
     if (OMPI_SUCCESS != rc) {
         ompi_comm_request_return (request);
+        OBJ_RELEASE(context->newcomp);
         return rc;
     }
 
@@ -1167,6 +1170,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(context->newcomp);
         return rc;
     }
 
@@ -1209,6 +1213,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
     /* Determine context id. It is identical to f_2_c_handle */
     rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1219,6 +1224,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1516,16 +1522,16 @@ int ompi_comm_free( ompi_communicator_t **comm )
 /**********************************************************************/
 /**********************************************************************/
 /**********************************************************************/
-ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
-                                     ompi_communicator_t *bridge_comm,
-                                     int local_leader,
-                                     int remote_leader,
-                                     int tag,
-                                     int rsize)
+int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
+                           ompi_communicator_t *bridge_comm,
+                           int local_leader,
+                           int remote_leader,
+                           int tag,
+                           int rsize,
+                           ompi_proc_t ***prprocs )
 {
-
     MPI_Request req;
-    int rc;
+    int rc = OMPI_SUCCESS;
     int local_rank, local_size;
     ompi_proc_t **rprocs=NULL;
     int32_t size_len;
@@ -1542,7 +1548,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     if (local_rank == local_leader) {
         sbuf = OBJ_NEW(opal_buffer_t);
         if (NULL == sbuf) {
-            rc = OMPI_ERROR;
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
             goto err_exit;
         }
         if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
@@ -1594,6 +1600,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     /* Allocate temporary buffer */
     recvbuf = (char *)malloc(rlen);
     if ( NULL == recvbuf ) {
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
         goto err_exit;
     }
 
@@ -1625,7 +1632,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
 
     rbuf = OBJ_NEW(opal_buffer_t);
     if (NULL == rbuf) {
-        rc = OMPI_ERROR;
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
         goto err_exit;
     }
 
@@ -1633,11 +1640,12 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
         goto err_exit;
     }
 
-    /* decode the names into a proc-list */
+    /* decode the names into a proc-list -- will never add a new proc
+       as the result of this operation, so no need to get the newprocs
+       list or call PML add_procs(). */
     rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
     OBJ_RELEASE(rbuf);
     if (OMPI_SUCCESS != rc) {
-        OMPI_ERROR_LOG(rc);
         goto err_exit;
     }
 
@@ -1657,7 +1665,6 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
 
     /* And now add the information into the database */
     if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
-        OMPI_ERROR_LOG(rc);
         goto err_exit;
     }
 
@@ -1665,6 +1672,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     /* rprocs isn't freed unless we have an error,
        since it is used in the communicator */
     if ( OMPI_SUCCESS != rc ) {
+        OMPI_ERROR_LOG(rc);
         opal_output(0, "%d: Error in ompi_get_rprocs\n", local_rank);
         if ( NULL != rprocs ) {
             free ( rprocs );
@@ -1685,7 +1693,8 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
         free ( sendbuf );
     }
 
-    return rprocs;
+    *prprocs = rprocs;
+    return rc;
 }
 /**********************************************************************/
 /**********************************************************************/
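ompi_comm_get_rprocs now reports failure through its return code and hands the remote proc array back through an out parameter, so in-tree callers follow a check-then-use pattern. A hypothetical caller sketch (not part of the patch; the helper name is mine, and it assumes the Open MPI communicator headers are available):

    /* Hypothetical in-tree helper illustrating the new calling convention. */
    static int example_fetch_remote_procs (ompi_communicator_t *local_comm,
                                           ompi_communicator_t *bridge_comm,
                                           int lleader, int rleader,
                                           int tag, int rsize)
    {
        ompi_proc_t **rprocs = NULL;
        int rc;

        rc = ompi_comm_get_rprocs (local_comm, bridge_comm, lleader,
                                   rleader, tag, rsize, &rprocs);
        if (OMPI_SUCCESS != rc) {
            /* rprocs is only meaningful when rc is OMPI_SUCCESS;
             * just hand the error code back to the caller. */
            return rc;
        }

        /* ... build the remote group from rprocs, then continue ... */
        return OMPI_SUCCESS;
    }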
@@ -372,6 +372,13 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
     int ret;
     int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
+    if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
+        if (participate) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+        }
+        return request->super.req_status.MPI_ERROR;
+    }
+
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
     }
@@ -409,11 +416,18 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
     int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
+    if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
+        if (participate) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextcid, NULL);
+        }
+        return request->super.req_status.MPI_ERROR;
+    }
+
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
     }
 
-    if (1 == context->rflag) {
+    if (0 != context->rflag) {
         if( !participate ) {
             /* we need to provide something sane here
              * but we cannot use `nextcid` as we may have it
@@ -444,7 +458,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
         return OMPI_SUCCESS;
     }
 
-    if (participate && (1 == context->flag)) {
+    if (participate && (0 != context->flag)) {
         /* we could use this cid, but other don't agree */
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
         context->start = context->nextcid + 1; /* that's where we can start the next round */
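Both additions above follow the same recovery rule: when the status carried by the comm request already holds an error from an earlier step, a participating rank must put the context-id slot it tentatively reserved in ompi_mpi_communicators back to NULL before passing the error on, otherwise the cid would stay leaked. A simplified, hypothetical restatement of that rule (the function and parameter names are mine, not from the patch):

    /* Simplified sketch of the roll-back rule used by ompi_comm_checkcid
     * and ompi_comm_nextcid_check_flag above. */
    static int example_abort_cid_step (ompi_comm_request_t *request,
                                       int reserved_cid, int participate)
    {
        int err = request->super.req_status.MPI_ERROR;

        if (OMPI_SUCCESS != err) {
            if (participate) {
                /* give the tentatively reserved cid slot back */
                opal_pointer_array_set_item (&ompi_mpi_communicators,
                                             reserved_cid, NULL);
            }
            return err;  /* propagate the original error code */
        }
        return OMPI_SUCCESS;
    }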
@@ -119,6 +119,11 @@ static int ompi_comm_request_progress (void)
             while (request_item->subreq_count) {
                 ompi_request_t *subreq = request_item->subreqs[request_item->subreq_count-1];
                 if( REQUEST_COMPLETE(subreq) ) {
+                    if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
+                        /* Let it continue but mark it as failed, so
+                         * that it does some subreqs cleanup */
+                        request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
+                    }
                     ompi_request_free (&subreq);
                     request_item->subreq_count--;
                 } else {
@@ -130,6 +135,8 @@ static int ompi_comm_request_progress (void)
         if (item_complete) {
             if (request_item->callback) {
                 opal_mutex_unlock (&ompi_comm_request_mutex);
+                /* the callback should check for errors in the request
+                 * status. */
                 rc = request_item->callback (request);
                 opal_mutex_lock (&ompi_comm_request_mutex);
             }
@@ -142,7 +149,7 @@ static int ompi_comm_request_progress (void)
         /* if the request schedule is empty then the request is complete */
         if (0 == opal_list_get_size (&request->schedule)) {
             opal_list_remove_item (&ompi_comm_requests_active, (opal_list_item_t *) request);
-            request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : MPI_ERR_INTERN;
+            request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
             ompi_request_complete (&request->super, true);
         }
     }
@@ -171,6 +178,7 @@ void ompi_comm_request_start (ompi_comm_request_t *request)
     }
 
     request->super.req_state = OMPI_REQUEST_ACTIVE;
+    request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
 
     opal_mutex_unlock (&ompi_comm_request_mutex);
 }
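The progress loop now copies a failing sub-request's status into the comm request, initializes that status to OMPI_SUCCESS when the request starts, and completes the request with the real error code rather than MPI_ERR_INTERN. The added comment spells out the resulting convention for scheduled callbacks; a minimal hypothetical callback following it (the function name is mine, assuming the Open MPI comm_request headers):

    /* Minimal sketch of a scheduled step honoring the convention above:
     * look at the error carried by the comm request before doing any work. */
    static int example_comm_request_step (ompi_comm_request_t *request)
    {
        if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
            /* an earlier sub-request failed: clean up any local state
             * and pass the error on unchanged */
            return request->super.req_status.MPI_ERROR;
        }

        /* ... normal work for this step ... */
        return OMPI_SUCCESS;
    }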
@@ -649,12 +649,13 @@ OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
  * The routine makes sure, that all processes have afterwards
  * a list of ompi_proc_t pointers for the remote group.
  */
-struct ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
-                                            ompi_communicator_t *bridge_comm,
-                                            int local_leader,
-                                            int remote_leader,
-                                            int tag,
-                                            int rsize);
+int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
+                           ompi_communicator_t *bridge_comm,
+                           int local_leader,
+                           int remote_leader,
+                           int tag,
+                           int rsize,
+                           struct ompi_proc_t ***prprocs );
 
 /**
  * This routine verifies, whether local_group and remote group are overlapping
@@ -160,6 +160,7 @@ int ompi_errhandler_request_invoke(int count,
     /* Invoke the exception */
     switch (type) {
     case OMPI_REQUEST_PML:
+    case OMPI_REQUEST_COLL:
         return ompi_errhandler_invoke(mpi_object.comm->error_handler,
                                       mpi_object.comm,
                                       mpi_object.comm->errhandler_type,
@@ -137,9 +137,9 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
         goto err_exit;
     }
 
-    rprocs = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
-                                   remote_leader, tag, rsize );
-    if ( NULL == rprocs ) {
+    rc = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
+                               remote_leader, tag, rsize, &rprocs );
+    if ( OMPI_SUCCESS != rc ) {
         goto err_exit;
     }
 
@@ -222,7 +222,7 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
     }
     if ( OMPI_SUCCESS != rc ) {
         *newintercomm = MPI_COMM_NULL;
-        return OMPI_ERRHANDLER_INVOKE(local_comm, MPI_ERR_INTERN,
+        return OMPI_ERRHANDLER_INVOKE(local_comm, rc,
                                       FUNC_NAME);
     }
 
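Since the error handler is invoked on local_comm with the actual error code, an application that installs MPI_ERRORS_RETURN on its local communicator now sees that code as the return value of MPI_Intercomm_create. A small runnable illustration (not part of the patch; run with at least two ranks, and under normal conditions the call simply succeeds):

    #include <mpi.h>
    #include <stdio.h>

    int main (int argc, char *argv[])
    {
        MPI_Comm half, inter;
        int rank, rc;

        MPI_Init (&argc, &argv);
        MPI_Comm_rank (MPI_COMM_WORLD, &rank);

        /* split COMM_WORLD into two halves bridged over COMM_WORLD */
        MPI_Comm_split (MPI_COMM_WORLD, rank % 2, rank, &half);
        /* errors from MPI_Intercomm_create are raised on the local
         * communicator, so install MPI_ERRORS_RETURN there */
        MPI_Comm_set_errhandler (half, MPI_ERRORS_RETURN);

        rc = MPI_Intercomm_create (half, 0, MPI_COMM_WORLD,
                                   (rank % 2) ? 0 : 1 /* remote leader */,
                                   42, &inter);
        if (MPI_SUCCESS != rc) {
            char msg[MPI_MAX_ERROR_STRING];
            int len;
            MPI_Error_string (rc, msg, &len);
            fprintf (stderr, "MPI_Intercomm_create failed: %s\n", msg);
        } else {
            MPI_Comm_free (&inter);
        }

        MPI_Comm_free (&half);
        MPI_Finalize ();
        return 0;
    }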