1
1

Add a tester for the ERRORS_ABORT and communicator abort features

Signed-off-by: Aurélien Bouteiller <bouteill@icl.utk.edu>
Этот коммит содержится в:
Aurélien Bouteiller 2020-05-05 14:23:47 -04:00 коммит произвёл Aurelien Bouteiller
родитель 9c22ad84af
Коммит e2f53b76fb
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 08F60797C5941DB2
2 изменённых файлов: 175 добавлений и 1 удалений

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort comm_abort simple_spawn \
concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \
bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \
crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \

174
test/simple/comm_abort.c Обычный файл
Просмотреть файл

@ -0,0 +1,174 @@
/* -*- C -*-
* Copyright (c) 2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* Test aborting communicators
*/
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"
/* Print a printf-style message from rank 0 only. Expects an `int rank`
 * variable to be in scope at the point of use. Wrapped in do/while(0) so the
 * macro behaves as a single statement (safe inside unbraced if/else), and
 * written with standard C99 __VA_ARGS__ rather than the GNU-only named
 * variadic parameter form. */
#define print1(...) do { if (0 == rank) { printf(__VA_ARGS__); } } while (0)
/*
 * Keep the calling process busy for about one second so that a previously
 * triggered abort has time to take effect, if it is going to.
 *
 * When `participate` is non-zero, each iteration also performs an
 * MPI_Allreduce(MIN) of the "keep going" flag on `comm`, so the members of
 * `comm` leave the loop together and MPI progress is driven. Processes whose
 * partner is expected to be gone must pass 0 to avoid communicating with it.
 */
static void spin_one_second(int participate, MPI_Comm comm)
{
    int more = 1;
    double start = MPI_Wtime();

    do {
        double now = MPI_Wtime();
        if (now - start > 1.) {
            more = 0;
        }
        if (participate) {
            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm);
        }
    } while (more);
}

/*
 * Exchange one double with ourselves over MPI_COMM_SELF. The payload is
 * irrelevant; the call only exists to enter the MPI library and give it an
 * opportunity to act on a pending abort when there is no asynchronous
 * progress.
 */
static void poke_progress(void)
{
    double out = 0., in = 0.;

    MPI_Sendrecv(&out, 1, MPI_DOUBLE, 0, 0,
                 &in, 1, MPI_DOUBLE, 0, 0,
                 MPI_COMM_SELF, MPI_STATUS_IGNORE);
}

/*
 * Tester for communicator-scoped abort and the MPI_ERRORS_ABORT error
 * handler. MPI_COMM_WORLD is split three times into the same 2-process
 * "islands" ({0,1}, {2,3}, ...), each copy carrying a different error
 * handler, and five scenarios are run in sequence:
 *   Test1: rank 1 calls MPI_Abort(MPI_COMM_SELF)
 *   Test2: rank 3 calls MPI_Abort on its pair communicator
 *   Test3: rank 5 invokes MPI_ERRORS_ABORT on its pair communicator
 *   Test4: rank 0 receives from the (aborted) rank 1 and reports the error
 *   Test5: rank 0 invokes MPI_ERRORS_ARE_FATAL on its pair communicator
 *
 * Requires an even number of processes. Returns 0 if MPI_Finalize is
 * reached, which the test itself treats as unexpected.
 */
int main(int argc, char* argv[])
{
    int rank, size;
    MPI_Comm comm_pair_fatal, comm_pair_return, comm_pair_abort;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (0 == rank && size % 2) {
        fprintf(stderr, "This test requires an even number of processes\n\n");
        MPI_Abort(MPI_COMM_WORLD, size);
    }

    /* Setup: split our world in a set of 2-processes islands */
    MPI_Comm_split(MPI_COMM_WORLD, rank / 2, rank, &comm_pair_fatal);
    MPI_Comm_set_errhandler(comm_pair_fatal, MPI_ERRORS_ARE_FATAL);

    MPI_Comm_split(MPI_COMM_WORLD, rank / 2, rank, &comm_pair_return);
    MPI_Comm_set_errhandler(comm_pair_return, MPI_ERRORS_RETURN);

    MPI_Comm_split(MPI_COMM_WORLD, rank / 2, rank, &comm_pair_abort);
    /* If this code fails to compile, the MPI implementation is not compliant
     * with MPI-4 (TODO: add ref to chapter/line when MPI-4 published). */
    MPI_Comm_set_errhandler(comm_pair_abort, MPI_ERRORS_ABORT);

    MPI_Barrier(MPI_COMM_WORLD);

    print1(
"This program will test partial abort functionality (communicator scoped abort).\n"
" Each test will perform a loop of communication on a subcommunicator for about\n"
" 1 second between printouts, and then, a 1 second cooldown.\n");

    /* ---- Test1 ---------------------------------------------------------- */
    print1("\n\n"
"Test1: MPI_Abort(MPI_COMM_SELF) aborts only one process?\n"
" In a high quality implementation, all ranks except %d\n"
" should report their presence.\n", 1);

    if (rank == 1) {
        MPI_Abort(MPI_COMM_SELF, 1);
    }
    /* Spin on communication for 1 second to let time for Abort to have an
     * effect, if any. Pair {0,1} does not reduce: rank 1 is aborted. */
    spin_one_second(rank > 1, comm_pair_fatal);
    printf(" This is rank %d: still kickin after %d MPI_Abort'ed self\n", rank, 1);

    sleep(1);
    print1("===============================================================\n");

    /* ---- Test2 ---------------------------------------------------------- */
    print1("\n\n"
"Test2: MPI_Abort(comm) aborts all processes in comm?\n"
" In a high quality implementation, all ranks except %d--%d\n"
" should report their presence.\n", 1, 3);

    if (rank == 3) {
        MPI_Abort(comm_pair_return, 2);
    }
    /* Spin on communication for 1 second to let time for Abort to have an
     * effect, if any. Pairs up to {2,3} do not reduce. */
    spin_one_second(rank > 3, comm_pair_fatal);
    printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 3, 2, 3);

    /* This process should have aborted, give it an opportunity to do so if no
     * async progress: message to self to spin MPI progress. */
    if (rank == 2) {
        poke_progress();
        printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 2);
    }

    sleep(1);
    print1("===============================================================\n");

    /* ---- Test3 ---------------------------------------------------------- */
    print1("\n\n"
"Test3: MPI_ERRORS_ABORT aborts all processes in comm?\n"
" In a high quality implementation, all ranks except %d--%d\n"
" should report their presence.\n", 1, 5);

    if (rank == 5) {
        MPI_Comm_call_errhandler(comm_pair_abort, 3);
    }
    /* Spin on communication for 1 second to let time for Abort to have an
     * effect, if any. Pairs up to {4,5} do not reduce. */
    spin_one_second(rank > 5, comm_pair_fatal);
    printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 5, 4, 5);

    /* This process should have aborted, give it an opportunity to do so if no
     * async progress: message to self to spin MPI progress. */
    if (rank == 4) {
        poke_progress();
        printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 4);
    }

    sleep(1);
    print1("===============================================================\n");

    /* ---- Test4 ---------------------------------------------------------- */
    print1("\n\n"
"Test4: Communicating with an aborted process %d returns a good error code?\n"
" In a high quality implementation, rank %d should print an error string;\n"
" In a higher quality implementation the error should be of class\n"
" MPI_ERR_PROC_ABORTED.\n", 1, 0);

    if (rank == 0) {
        int err, err_class, slen;
        int payload = 0;
        char str[MPI_MAX_ERROR_STRING];

        /* remember, 1 aborted in test1 */
        /* The receive must run first: its return code is what gets decoded
         * into an error class and an error string below. */
        err = MPI_Recv(&payload, 1, MPI_INT, 1, 0, comm_pair_return, MPI_STATUS_IGNORE);
        MPI_Error_class(err, &err_class);
        MPI_Error_string(err, str, &slen);
        printf(" This is rank %d: Recv(from=%d) returned code=%d: class=%d: %s\n", 0, 1, err, err_class, str);
    }

    sleep(1);
    print1("===============================================================\n");

    /* ---- Test5 ---------------------------------------------------------- */
    print1("\n\n"
"Test5: MPI_ERRORS_ARE_FATAL aborts all processes?\n");

    if (rank == 0) {
        MPI_Comm_call_errhandler(comm_pair_fatal, 5);
    }
    /* Spin on communication for 1 second to let time for Abort to have an
     * effect, if any. */
    spin_one_second(rank > 5, comm_pair_fatal);

    poke_progress();
    printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", rank);

    /* Should never get there */
    MPI_Finalize();
    return 0;
}