Fix add-host support by including the location for procs of prior jobs when spawning new daemons.
Thanks to CalugaruVaxile for the report.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Parent: 0d1c58853b
Commit: 4316213805
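In outline, the fix touches three areas. On the packing side, orte_odls_base_default_get_add_procs_data now includes, for every proc of each previously launched job, the vpid of the daemon hosting that proc; on the unpacking side, orte_odls_base_default_construct_child_list reads those vpids back and attaches each proc to the corresponding daemon's node, so daemons spawned later (e.g. via the "add-host" MPI_Comm_spawn info key) know where the procs of earlier jobs live. The PLM gets a routing-plan refresh and a clearer debug message, and a new orte/test/mpi/add_host test exercises the feature.

For orientation, a minimal sketch of the spawn pattern involved follows. It is not part of the commit (it is essentially a trimmed copy of the add_host test added at the bottom of this diff), and the "newnode:4" host:slots value is a placeholder for a real machine name:

    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        MPI_Comm parent, child;
        MPI_Info info;

        MPI_Init(&argc, &argv);
        MPI_Comm_get_parent(&parent);

        if (MPI_COMM_NULL == parent) {
            /* parent side: ask the RTE to add a host to the allocation,
             * then spawn children onto it */
            MPI_Info_create(&info);
            MPI_Info_set(info, "add-host", "newnode:4");  /* placeholder host:slots */
            MPI_Comm_spawn(argv[0], MPI_ARGV_NULL, 2, info, 0,
                           MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE);
            MPI_Info_free(&info);
            MPI_Comm_disconnect(&child);
        } else {
            /* child side: simply disconnect from the parent */
            MPI_Comm_disconnect(&parent);
        }

        MPI_Finalize();
        return 0;
    }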
.gitignore (1 line added)

@@ -435,6 +435,7 @@ orte/test/mpi/badcoll
 orte/test/mpi/iof
 orte/test/mpi/no-disconnect
 orte/test/mpi/nonzero
+orte/test/mpi/add_host
 orte/test/system/radix
 orte/test/system/sigusr_trap
@@ -117,9 +117,10 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
     void *nptr;
     uint32_t key;
     char *nidmap;
-    orte_proc_t *dmn;
+    orte_proc_t *dmn, *proc;
     opal_value_t *val = NULL, *kv;
     opal_list_t *modex;
     int n;
 
     /* get the job data pointer */
     if (NULL == (jdata = orte_get_job_data_object(job))) {

@@ -282,6 +283,17 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
                 OBJ_DESTRUCT(&jobdata);
                 return rc;
             }
+            /* pack the location of each proc */
+            for (n=0; n < jptr->procs->size; n++) {
+                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, n))) {
+                    continue;
+                }
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &proc->parent, 1, ORTE_VPID))) {
+                    ORTE_ERROR_LOG(rc);
+                    OBJ_DESTRUCT(&jobdata);
+                    return rc;
+                }
+            }
             ++numjobs;
         }
         rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
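(In ORTE, a proc's parent field holds the vpid of the daemon hosting that proc, which is why packing proc->parent per proc is enough for a newly launched daemon to rebuild the proc-to-node map; the matching unpack is in orte_odls_base_default_construct_child_list below.)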
@@ -355,6 +367,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
     orte_std_cntr_t cnt;
     orte_job_t *jdata=NULL, *daemons;
     orte_node_t *node;
+    orte_vpid_t dmnvpid, v;
     int32_t n, k;
     opal_buffer_t *bptr;
     orte_proc_t *pptr, *dmn;
@@ -411,6 +424,31 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
             /* yep - so we can drop this copy */
             jdata->jobid = ORTE_JOBID_INVALID;
             OBJ_RELEASE(jdata);
             continue;
         }
+        /* unpack the location of each proc in this job */
+        for (v=0; v < jdata->num_procs; v++) {
+            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, v))) {
+                pptr = OBJ_NEW(orte_proc_t);
+                pptr->name.jobid = jdata->jobid;
+                pptr->name.vpid = v;
+                opal_pointer_array_set_item(jdata->procs, v, pptr);
+            }
+            cnt=1;
+            if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &dmnvpid, &cnt, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_RELEASE(jdata);
+                goto REPORT_ERROR;
+            }
+            /* lookup the daemon */
+            if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dmnvpid))) {
+                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                rc = ORTE_ERR_NOT_FOUND;
+                goto REPORT_ERROR;
+            }
+            /* connect the two */
+            OBJ_RETAIN(dmn->node);
+            pptr->node = dmn->node;
+        }
     }
     /* release the buffer */
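On the receiving side, the loop above lazily creates an orte_proc_t for any rank not yet known locally, unpacks the hosting daemon's vpid, looks that daemon up in the daemon job, and points the proc at the daemon's node (with OBJ_RETAIN keeping the node's reference count balanced).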
@@ -173,6 +173,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
     if (orte_display_allocation) {
         orte_ras_base_display_alloc();
     }
+    /* ensure we update the routing plan */
+    orte_routed.update_routing_plan(NULL);
 
     /* progress the job */
     caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;

@@ -1346,8 +1348,9 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
     } else {
         jdatorted->num_reported++;
         OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
-                             "%s plm:base:orted_report_launch recvd %d of %d reported daemons",
+                             "%s plm:base:orted_report_launch job %s recvd %d of %d reported daemons",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_JOBID_PRINT(jdatorted->jobid),
                              jdatorted->num_reported, jdatorted->num_procs));
         if (jdatorted->num_procs == jdatorted->num_reported) {
             bool dvm = true;
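The PLM changes are supporting details: daemons_reported now refreshes the routing plan, presumably so routes account for daemons added after the initial launch, and the daemon-callback debug message now names the daemon job whose reports are being counted.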
@@ -5,7 +5,7 @@ PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spaw
 	parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort \
 	debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info \
 	info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib \
-	no-disconnect nonzero interlib pinterlib
+	no-disconnect nonzero interlib pinterlib add_host
 
 all: $(PROGS)
 
orte/test/mpi/add_host.c (new file, 68 lines)

@@ -0,0 +1,68 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/param.h>

#include <mpi.h>

int main(int argc, char* argv[])
{
    int msg, rc;
    MPI_Comm parent, child;
    int rank, size;
    char hostname[MAXHOSTNAMELEN];
    pid_t pid;
    char *env_rank,*env_nspace;
    MPI_Info info;

    env_rank = getenv("PMIX_RANK");
    env_nspace = getenv("PMIX_NAMESPACE");
    pid = getpid();
    gethostname(hostname, sizeof(hostname));

    printf("[%s:%s pid %ld] starting up on node %s!\n", env_nspace, env_rank, (long)pid, hostname);

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("%d completed MPI_Init\n", rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_get_parent(&parent);
    /* If we get COMM_NULL back, then we're the parent */
    if (MPI_COMM_NULL == parent) {
        pid = getpid();
        printf("Parent [pid %ld] about to spawn!\n", (long)pid);
        MPI_Info_create(&info);
        MPI_Info_set(info, "add-host", "rhc002:24");
        if (MPI_SUCCESS != (rc = MPI_Comm_spawn(argv[0], MPI_ARGV_NULL, 3, info,
                                                0, MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE))) {
            printf("Child failed to spawn\n");
            return rc;
        }
        printf("Parent done with spawn\n");
        if (0 == rank) {
            msg = 38;
            printf("Parent sending message to child\n");
            MPI_Send(&msg, 1, MPI_INT, 0, 1, child);
        }
        MPI_Comm_disconnect(&child);
        printf("Parent disconnected\n");
    }
    /* Otherwise, we're the child */
    else {
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        pid = getpid();
        printf("Hello from the child %d of %d on host %s pid %ld\n", rank, 3, hostname, (long)pid);
        if (0 == rank) {
            MPI_Recv(&msg, 1, MPI_INT, 0, 1, parent, MPI_STATUS_IGNORE);
            printf("Child %d received msg: %d\n", rank, msg);
        }
        MPI_Comm_disconnect(&parent);
        printf("Child %d disconnected\n", rank);
    }

    MPI_Finalize();
    fprintf(stderr, "%d: exiting\n", pid);
    return 0;
}
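A note on running the new test: it is built through the Makefile change above (add_host is appended to PROGS). The "add-host" value rhc002:24 is a developer-specific host and slot count, so to exercise the fix elsewhere it must be edited to name a node that is reachable but not already part of the allocation; the test can then be launched under mpirun (for example, mpirun -n 1 ./add_host), after which the parent spawns three children onto the added host.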