/* Contributed by Marcia Cristina Cera,
   http://www.open-mpi.org/community/lists/users/2009/12/11540.php */

/* It was decided that the issue highlighted by this test will NOT be
   fixed in the 1.3/1.4 series.  It is already fixed in the 1.5 series.
   Hence, if we detect Open MPI < v1.5, return 77/skip. */

/* Turns out the hnp cannot handle concurrent MPI_Comm_spawns as of
   Open MPI 1.7.  However, we hope this feature will work in 2.0 with
   the new state-machine-based orte. */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <sys/utsname.h>
#include <mpi.h>

#define NCHARS 30
const int max_depth = 4;

/*
 * Here are some replacements for standard, blocking MPI
 * functions.  These replacements are "nice" and yield the
 * CPU instead of spinning hard.  The interfaces are the same.
 * Just replace:
 *     MPI_Recv    with  nice_recv
 *     MPI_Send    with  nice_send
 *     MPI_Barrier with  nice_barrier
 */

static int nice_send(void *buf, int count, MPI_Datatype datatype, int dest,
                     int tag, MPI_Comm comm)
{
    /* Assume a standard (presumably short/eager) send suffices. */
    return MPI_Send(buf, count, datatype, dest, tag, comm);
}

static int nice_recv(void *buf, int count, MPI_Datatype datatype, int source,
                     int tag, MPI_Comm comm, MPI_Status *status)
{
    MPI_Request req;
    int flag;
    struct timespec dt;

    /*
     * We're only interested in modest levels of oversubscription
     * -- e.g., 2-4x more processes than physical processors.
     * So, the sleep time only needs to be about 2-4x longer than
     * a futile MPI_Test call.  For a wide range of processors,
     * something less than a millisecond should be sufficient.
     * Excessive sleep times (e.g., 1 second) would degrade performance.
     */
    dt.tv_sec  = 0;
    dt.tv_nsec = 100000;

    MPI_Irecv(buf, count, datatype, source, tag, comm, &req);
    MPI_Test(&req, &flag, status);
    while (!flag) {
        nanosleep(&dt, NULL);
        MPI_Test(&req, &flag, status);
    }

    return MPI_SUCCESS;
}

static void nice_barrier(MPI_Comm comm)
{
    int me, np, jump, buf = -1;

    MPI_Comm_rank(comm, &me);
    MPI_Comm_size(comm, &np);

    /* fan in */
    for (jump = 1; jump < np; jump <<= 1) {
        if ((me & jump) != 0) {
            nice_send(&buf, 1, MPI_INT, me - jump, 343, comm);
            break;
        } else if (me + jump < np) {
            nice_recv(&buf, 1, MPI_INT, me + jump, 343, comm, MPI_STATUS_IGNORE);
        }
    }

    /* fan out */
    if (0 != me) {
        nice_recv(&buf, 1, MPI_INT, me - jump, 344, comm, MPI_STATUS_IGNORE);
    }
    jump >>= 1;
    for (; jump > 0; jump >>= 1) {
        if (me + jump < np) {
            nice_send(&buf, 1, MPI_INT, me + jump, 344, comm);
        }
    }
}

int main(int argc, char **argv)
{
    char bufs[NCHARS];       /* send buffer  */
    char bufr[2][NCHARS];    /* recv buffers */
    MPI_Comm parent;
    int level = 0, participate = 1;
    struct utsname buf;

    /* If this is prior to OMPI v2.0, return 77/skip */
#if defined(OPEN_MPI)
    if (OMPI_MAJOR_VERSION < 2) {
        printf("Skipping, because the orte cannot handle concurrent MPI_Comm_spawns\n");
        return 77;
    } else {
        printf("Verify that this test is truly working, because concurrent MPI_Comm_spawns"
               " have not worked before.\n");
    }
#endif

    uname(&buf);
    printf("I AM pid %d with level %d on %s\n",
           getpid(), (argc < 2) ? 0 : atoi(argv[1]), buf.nodename);

    MPI_Init(&argc, &argv);
    MPI_Comm_get_parent(&parent);

    if (MPI_COMM_NULL != parent) {
        /* spawned processes get stuff from parent */
        level = atoi(argv[1]);
        MPI_Recv(&bufr[0], sizeof(char) * NCHARS, MPI_CHAR, MPI_ANY_SOURCE,
                 MPI_ANY_TAG, parent, MPI_STATUS_IGNORE);
        printf("Parent sent: %s\n", bufr[0]);
    } else {
        /* original processes have to decide whether to participate */

        /* In this test, each process launched by "mpirun -n <np>" spawns a
         * binary tree of processes.
         * You end up with ( 1 << max_depth ) * <np> processes
         * altogether.  For max_depth=4, this means 16*<np>.  There
         * is potential here for heavy oversubscription, especially if in
         * testing we launch tests with <np> set to the number of available
         * processors.  This test tolerates oversubscription somewhat since
         * it entails little inter-process synchronization.  Nevertheless,
         * we try to idle all but <np>/4 of the original processes, using a
         * minimum of at least two processes. */
        int me, np;
        MPI_Comm_size(MPI_COMM_WORLD, &np);
        MPI_Comm_rank(MPI_COMM_WORLD, &me);
        if (np > 4) {
            /* turn off all but every 4th process */
            if ((me & 3) != 0)
                participate = 0;
        } else if (np > 2) {
            /* turn off all but every 2nd process */
            if ((me & 1) != 0)
                participate = 0;
        }
    }

    /* all spawned processes and selected "root" processes participate */
    if (participate) {
        printf("level = %d\n", level);

        /* prepare send buffer */
        sprintf(bufs, "level %d (pid:%d)", level, getpid());

        /* spawn */
        if (level < max_depth) {
            int i, nspawn = 2, errcodes[1];
            MPI_Request req[2];
            MPI_Comm comm[2];
            char argv1[NCHARS];
            char *args[2];

            /* level 0 spawns only one process to mimic the original test */
            if (level == 0)
                nspawn = 1;

            /* prepare command line arguments */
            snprintf(argv1, sizeof(argv1), "%d", level + 1);
            args[0] = argv1;
            args[1] = NULL;

            /* spawn, with a message sent to and received from each child */
            for (i = 0; i < nspawn; i++) {
                MPI_Comm_spawn(argv[0], args, 1, MPI_INFO_NULL, 0,
                               MPI_COMM_SELF, &comm[i], errcodes);
                MPI_Send(&bufs, sizeof(char) * NCHARS, MPI_CHAR, 0, 100, comm[i]);
                MPI_Irecv(&bufr[i], sizeof(char) * NCHARS, MPI_CHAR, MPI_ANY_SOURCE,
                          MPI_ANY_TAG, comm[i], &req[i]);
            }

            /* wait for messages from children and print them */
            MPI_Waitall(nspawn, req, MPI_STATUSES_IGNORE);
            for (i = 0; i < nspawn; i++)
                printf("Child %d sent: %s\n", i, bufr[i]);
        }

        /* send message back to parent */
        if (MPI_COMM_NULL != parent) {
            MPI_Send(&bufs, sizeof(char) * NCHARS, MPI_CHAR, 0, 100, parent);
        }
    }

    /* non-participating processes wait at this barrier for their peers */
    /* (This barrier won't cost that many CPU cycles.) */
    if (MPI_COMM_NULL == parent) {
        nice_barrier(MPI_COMM_WORLD);
    }

    MPI_Finalize();

    return 0;
}
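
/*
 * Build/run sketch (an assumption for illustration; the file name
 * "concurrent_spawn.c" and the launch parameters are not part of the
 * original source, and a site's test harness may differ):
 *
 *     mpicc concurrent_spawn.c -o concurrent_spawn
 *     mpirun -n 2 ./concurrent_spawn
 *
 * An exit status of 77 follows the common automake test-suite convention
 * for marking a test as skipped rather than failed, which is why the
 * Open MPI version check above returns 77 instead of aborting.
 */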