Fix add-host support by including the location for procs of prior jobs when spawning new daemons.

Thanks to CalugaruVaxile for the report.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
parent 0d1c58853b
commit 4316213805
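In outline, the fix has two halves: when new daemons are spawned, the HNP now packs, for every job already running, the vpid of the daemon hosting each proc (proc->parent), and each new daemon unpacks those vpids to attach every prior proc to the right node. Below is a minimal standalone sketch of that pack/unpack invariant; vpid_t, pack_locations, and unpack_locations are simplified stand-ins for illustration only, not the real opal_dss/ORTE API used in the diffs that follow.

/* Sketch only: the wire-format invariant added by this commit.
 * One daemon vpid per proc slot, in slot order; the pack and
 * unpack loops must stay in lock-step. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint32_t vpid_t;   /* stand-in for orte_vpid_t */

/* HNP side: append one hosting-daemon vpid per proc slot */
static size_t pack_locations(uint8_t *buf, const vpid_t *daemon_of, size_t nprocs)
{
    memcpy(buf, daemon_of, nprocs * sizeof(vpid_t));
    return nprocs * sizeof(vpid_t);
}

/* new-daemon side: read the vpids back in the same order, so proc v
 * can be attached to the node object of daemon out[v] */
static void unpack_locations(const uint8_t *buf, vpid_t *out, size_t nprocs)
{
    memcpy(out, buf, nprocs * sizeof(vpid_t));
}

int main(void)
{
    /* procs 0-3 of a prior job, hosted on daemons 0, 0, 1, 2 */
    vpid_t daemon_of[4] = { 0, 0, 1, 2 };
    uint8_t wire[sizeof(daemon_of)];
    vpid_t got[4];

    pack_locations(wire, daemon_of, 4);
    unpack_locations(wire, got, 4);
    for (size_t v = 0; v < 4; v++) {
        printf("proc %zu -> daemon %u\n", v, (unsigned)got[v]);
    }
    return 0;
}

The loops added in the odls hunks below have exactly this shape, with opal_dss.pack/unpack carrying ORTE_VPID values instead of raw memcpy.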
.gitignore (vendored) | 1 +
@@ -435,6 +435,7 @@ orte/test/mpi/badcoll
 orte/test/mpi/iof
 orte/test/mpi/no-disconnect
 orte/test/mpi/nonzero
+orte/test/mpi/add_host
 orte/test/system/radix
 orte/test/system/sigusr_trap
orte/mca/odls/base/odls_base_default_fns.c

@@ -117,9 +117,10 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
     void *nptr;
     uint32_t key;
     char *nidmap;
-    orte_proc_t *dmn;
+    orte_proc_t *dmn, *proc;
     opal_value_t *val = NULL, *kv;
     opal_list_t *modex;
+    int n;
 
     /* get the job data pointer */
     if (NULL == (jdata = orte_get_job_data_object(job))) {
@@ -282,6 +283,17 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
                 OBJ_DESTRUCT(&jobdata);
                 return rc;
             }
+            /* pack the location of each proc */
+            for (n=0; n < jptr->procs->size; n++) {
+                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, n))) {
+                    continue;
+                }
+                if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &proc->parent, 1, ORTE_VPID))) {
+                    ORTE_ERROR_LOG(rc);
+                    OBJ_DESTRUCT(&jobdata);
+                    return rc;
+                }
+            }
             ++numjobs;
         }
         rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
@@ -355,6 +367,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
     orte_std_cntr_t cnt;
     orte_job_t *jdata=NULL, *daemons;
     orte_node_t *node;
+    orte_vpid_t dmnvpid, v;
     int32_t n, k;
     opal_buffer_t *bptr;
     orte_proc_t *pptr, *dmn;
@@ -411,6 +424,31 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
             /* yep - so we can drop this copy */
             jdata->jobid = ORTE_JOBID_INVALID;
             OBJ_RELEASE(jdata);
+            continue;
+        }
+        /* unpack the location of each proc in this job */
+        for (v=0; v < jdata->num_procs; v++) {
+            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, v))) {
+                pptr = OBJ_NEW(orte_proc_t);
+                pptr->name.jobid = jdata->jobid;
+                pptr->name.vpid = v;
+                opal_pointer_array_set_item(jdata->procs, v, pptr);
+            }
+            cnt=1;
+            if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &dmnvpid, &cnt, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_RELEASE(jdata);
+                goto REPORT_ERROR;
+            }
+            /* lookup the daemon */
+            if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dmnvpid))) {
+                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                rc = ORTE_ERR_NOT_FOUND;
+                goto REPORT_ERROR;
+            }
+            /* connect the two */
+            OBJ_RETAIN(dmn->node);
+            pptr->node = dmn->node;
         }
     }
     /* release the buffer */
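A note on the pairing: the pack loop iterates jptr->procs->size slots but skips NULL entries, so it emits one ORTE_VPID (the hosting daemon, proc->parent) per live proc, while the unpack loop consumes jdata->num_procs vpids. The counts agree because the pointer array's size can exceed num_procs only by empty slots. Each unpacked vpid is then resolved against the daemon job's proc array, and the proc is pointed at that daemon's node, with an OBJ_RETAIN to keep the node object alive.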
orte/mca/plm/base/plm_base_launch_support.c

@@ -173,6 +173,8 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
     if (orte_display_allocation) {
         orte_ras_base_display_alloc();
     }
+    /* ensure we update the routing plan */
+    orte_routed.update_routing_plan(NULL);
 
     /* progress the job */
     caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
@@ -1346,8 +1348,9 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
         } else {
             jdatorted->num_reported++;
             OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
-                                 "%s plm:base:orted_report_launch recvd %d of %d reported daemons",
+                                 "%s plm:base:orted_report_launch job %s recvd %d of %d reported daemons",
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_JOBID_PRINT(jdatorted->jobid),
                                  jdatorted->num_reported, jdatorted->num_procs));
             if (jdatorted->num_procs == jdatorted->num_reported) {
                 bool dvm = true;
orte/test/mpi/Makefile

@@ -5,7 +5,7 @@ PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spaw
 	parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort \
 	debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info \
 	info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib \
-	no-disconnect nonzero interlib pinterlib
+	no-disconnect nonzero interlib pinterlib add_host
 
 all: $(PROGS)
 
orte/test/mpi/add_host.c (new file) | 68 +
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/param.h>
+
+#include <mpi.h>
+
+int main(int argc, char* argv[])
+{
+    int msg, rc;
+    MPI_Comm parent, child;
+    int rank, size;
+    char hostname[MAXHOSTNAMELEN];
+    pid_t pid;
+    char *env_rank,*env_nspace;
+    MPI_Info info;
+
+    env_rank = getenv("PMIX_RANK");
+    env_nspace = getenv("PMIX_NAMESPACE");
+    pid = getpid();
+    gethostname(hostname, sizeof(hostname));
+
+    printf("[%s:%s pid %ld] starting up on node %s!\n", env_nspace, env_rank, (long)pid, hostname);
+
+    MPI_Init(NULL, NULL);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    printf("%d completed MPI_Init\n", rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_get_parent(&parent);
+    /* If we get COMM_NULL back, then we're the parent */
+    if (MPI_COMM_NULL == parent) {
+        pid = getpid();
+        printf("Parent [pid %ld] about to spawn!\n", (long)pid);
+        MPI_Info_create(&info);
+        MPI_Info_set(info, "add-host", "rhc002:24");
+        if (MPI_SUCCESS != (rc = MPI_Comm_spawn(argv[0], MPI_ARGV_NULL, 3, info,
+                                                0, MPI_COMM_WORLD, &child, MPI_ERRCODES_IGNORE))) {
+            printf("Child failed to spawn\n");
+            return rc;
+        }
+        printf("Parent done with spawn\n");
+        if (0 == rank) {
+            msg = 38;
+            printf("Parent sending message to child\n");
+            MPI_Send(&msg, 1, MPI_INT, 0, 1, child);
+        }
+        MPI_Comm_disconnect(&child);
+        printf("Parent disconnected\n");
+    }
+    /* Otherwise, we're the child */
+    else {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        MPI_Comm_size(MPI_COMM_WORLD, &size);
+        pid = getpid();
+        printf("Hello from the child %d of %d on host %s pid %ld\n", rank, 3, hostname, (long)pid);
+        if (0 == rank) {
+            MPI_Recv(&msg, 1, MPI_INT, 0, 1, parent, MPI_STATUS_IGNORE);
+            printf("Child %d received msg: %d\n", rank, msg);
+        }
+        MPI_Comm_disconnect(&parent);
+        printf("Child %d disconnected\n", rank);
+    }
+
+    MPI_Finalize();
+    fprintf(stderr, "%d: exiting\n", pid);
+    return 0;
+}
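Usage note (an assumption, not part of the commit): as a self-spawning test, add_host would be launched with something like `mpirun -n 2 ./add_host`; the parent ranks collectively call MPI_Comm_spawn, and the "add-host" info value asks the runtime to extend the job onto the named host, which is exactly the path this commit repairs. The value "rhc002:24" is hard-coded, apparently in host:slots form, so it would need to be edited to a reachable node before running the test elsewhere.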