openmpi/orte/test/mpi/no-disconnect.c

/* Contributed by Marcia Cristina Cera
   <marcia.cristina.cera@gmail.com>,
   http://www.open-mpi.org/community/lists/users/2009/12/11540.php */

/* It was decided that the issue highlighted by this test will NOT be
   fixed in the 1.3/1.4 series.  It is already fixed in the 1.5
   series.  Hence, if we detect Open MPI < v1.5, return 77/skip. */

/* Turns out the hnp cannot handle concurrent MPI_Comm_spawns
   as of Open MPI 1.7.  However, we hope this feature will
   work in 2.0 with the new state-machine-based orte. */

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/utsname.h>
#include <mpi.h>
#define NCHARS 30
const int max_depth = 4;

/*
 * Here are some replacements for standard, blocking MPI
 * functions.  These replacements are "nice" and yield the
 * CPU instead of spinning hard.  The interfaces are the same.
 * Just replace:
 *     MPI_Recv     with   nice_recv
 *     MPI_Send     with   nice_send
 *     MPI_Barrier  with   nice_barrier
 */

static int nice_send(void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm) {
    /* Assume a standard (presumably short/eager) send suffices. */
    return MPI_Send(buf, count, datatype, dest, tag, comm);
}

static int nice_recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status) {
    MPI_Request req;
    int flag;
    struct timespec dt;

    /*
     * We're only interested in modest levels of oversubscription
     * -- e.g., 2-4x more processes than physical processors.
     * So, the sleep time only needs to be about 2-4x longer than
     * a futile MPI_Test call.  For a wide range of processors,
     * something less than a millisecond should be sufficient.
     * Excessive sleep times (e.g., 1 second) would degrade performance.
     */
    dt.tv_sec  = 0;
    dt.tv_nsec = 100000;

    MPI_Irecv(buf, count, datatype, source, tag, comm, &req);
    MPI_Test(&req, &flag, status);
    while ( ! flag ) {
        nanosleep(&dt, NULL);
        MPI_Test(&req, &flag, status);
    }
    return MPI_SUCCESS;
}

static void nice_barrier(MPI_Comm comm) {
    int me, np, jump, buf = -1;

    MPI_Comm_rank(comm,&me);
    MPI_Comm_size(comm,&np);

    /* fan in: each nonzero rank sends one message down to rank (me - jump)
     * and then drops out of the loop, so rank 0 eventually knows that
     * everyone has arrived */
    for ( jump = 1; jump < np; jump <<= 1 ) {
        if ( ( me & jump ) != 0 ) {
            nice_send(&buf, 1, MPI_INT, me - jump, 343, comm);
            break;
        } else if ( me + jump < np ) {
            nice_recv(&buf, 1, MPI_INT, me + jump, 343, comm, MPI_STATUS_IGNORE);
        }
    }

    /* fan out: the release message travels back up the same tree,
     * from rank 0 toward the higher ranks */
    if ( 0 != me ) {
        nice_recv(&buf, 1, MPI_INT, me - jump, 344, comm, MPI_STATUS_IGNORE);
    }
    jump >>= 1;
    for ( ; jump > 0; jump >>= 1 ) {
        if ( me + jump < np ) {
            nice_send(&buf, 1, MPI_INT, me + jump, 344, comm);
        }
    }
}

int main (int argc, char **argv)
{
    char bufs[NCHARS];     /* send buffer  */
    char bufr[2][NCHARS];  /* recv buffers */
    MPI_Comm parent;
    int level = 0, participate = 1;
    struct utsname buf;

    /* If this is prior to OMPI v2.0, return 77/skip */
#if defined(OPEN_MPI)
    if (OMPI_MAJOR_VERSION < 2) {
        printf("Skipping, because the orte cannot handle concurrent MPI_Comm_spawns\n");
        return 77;
    } else {
        printf("Verify that this test is truly working because concurrent MPI_Comm_spawns"
               " have not worked before.\n");
    }
#endif

    uname(&buf);
    printf("I AM pid %d with level %d on %s\n", getpid(), (argc < 2) ? 0 : atoi(argv[1]), buf.nodename);
    MPI_Init(&argc, &argv);
    MPI_Comm_get_parent(&parent);

    if (MPI_COMM_NULL != parent) {
        /* spawned processes get stuff from parent */
        level = atoi(argv[1]);
        MPI_Recv(&bufr[0], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE,
                 MPI_ANY_TAG, parent, MPI_STATUS_IGNORE);
        printf("Parent sent: %s\n", bufr[0]);
    } else {
        /* original processes have to decide whether to participate */

        /* In this test, each process launched by "mpirun -n <np>" spawns a
         * binary tree of processes.  You end up with <np> * ( 1 << max_depth )
         * processes altogether.  For max_depth=4, this means 16*<np>.  There
         * is potential here for heavy oversubscription, especially if in
         * testing we launch with <np> set to the number of available
         * processors.  This test tolerates oversubscription somewhat since
         * it entails little inter-process synchronization.  Nevertheless,
         * we try to idle all but <np>/4 of the original processes, keeping
         * at least two of them active (a worked example follows below).
         */
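        /* Worked example (the numbers are illustrative, not part of the
         * original test): with "mpirun -n 4" and max_depth = 4, each
         * participating rank grows a tree of 1 << max_depth = 16 processes,
         * itself included (1 process at levels 0 and 1, then 2, 4, and 8).
         * If all four ranks participated, that would be 4 * 16 = 64
         * processes; the throttling below keeps only ranks 0 and 2
         * spawning, so the actual total is 2 * 16 + 2 = 34.
         */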
        int me, np;

        MPI_Comm_size(MPI_COMM_WORLD,&np);
        MPI_Comm_rank(MPI_COMM_WORLD,&me);

        if ( np > 4 ) {
            /* turn off all but every 4th process */
            if ( ( me & 3 ) != 0 ) participate = 0;
        } else if ( np > 2 ) {
            /* turn off all but every 2nd process */
            if ( ( me & 1 ) != 0 ) participate = 0;
        }
    }
    /* all spawned processes and selected "root" processes participate */
    if ( participate ) {
        printf("level = %d\n", level);

        /* prepare send buffer */
        sprintf(bufs,"level %d (pid:%d)", level, getpid());

        /* spawn */
        if (level < max_depth) {
            int i, nspawn = 2, errcodes[1];
            MPI_Request req[2];
            MPI_Comm comm[2];
            char argv1[NCHARS];
            char *args[2];

            /* level 0 spawns only one process to mimic the original test */
            if ( level == 0 ) nspawn = 1;

            /* prepare command line arguments */
            snprintf(argv1, sizeof(argv1), "%d", level+1);
            args[0] = argv1;
            args[1] = NULL;

            /* spawn, with a message sent to and received from each child */
            for ( i = 0; i < nspawn; i++ ) {
                MPI_Comm_spawn(argv[0], args, 1, MPI_INFO_NULL, 0, MPI_COMM_SELF,
                               &comm[i], errcodes);
                MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, comm[i]);
                MPI_Irecv(&bufr[i], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE,
                          MPI_ANY_TAG, comm[i], &req[i]);
            }

            /* wait for messages from children and print them */
            MPI_Waitall(nspawn, req, MPI_STATUSES_IGNORE);
            for ( i = 0; i < nspawn; i++ )
                printf("Child %d sent: %s\n", i, bufr[i]);
        }

        /* send message back to parent */
        if (MPI_COMM_NULL != parent) {
            MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, parent);
        }
    }

    /* non-participating processes wait at this barrier for their peers */
    /* (This barrier won't cost that many CPU cycles.) */
    if (MPI_COMM_NULL == parent) {
        nice_barrier(MPI_COMM_WORLD);
    }

    MPI_Finalize();

    return 0;
}