Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/orte_constants.h"
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
|
|
|
|
#include "orte/mca/ns/base/base.h"
|
|
|
|
|
|
|
|
int main(int argc, char **argv)
|
|
|
|
{
|
|
|
|
orte_process_name_t *test_name;
|
|
|
|
orte_cellid_t cell;
|
|
|
|
orte_jobid_t job;
|
|
|
|
orte_vpid_t vpid;
|
|
|
|
int i, j, rc;
|
|
|
|
char *tmp, *site, *resource;
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != orte_init(true)) {
|
|
|
|
fprintf(stderr, "failed to start ORTE\n");
|
|
|
|
exit (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* create a name */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&test_name, 0, 1, 1))) { /* got error */
|
|
|
|
fprintf(stderr, "create process name failed with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "got process name: %ld %ld %ld\n", ORTE_NAME_ARGS(test_name));
|
|
|
|
}
|
|
|
|
free(test_name);
|
|
|
|
|
|
|
|
/* convert a string to a name */
|
2006-12-07 03:11:20 +00:00
|
|
|
tmp = strdup("124.5678.0010");
|
Bring over the update to terminate orteds that are generated by a dynamic spawn such as comm_spawn. This introduces the concept of a job "family" - i.e., jobs that have a parent/child relationship. Comm_spawn'ed jobs have a parent (the one that spawned them). We track that relationship throughout the lineage - i.e., if a comm_spawned job in turn calls comm_spawn, then it has a parent (the one that spawned it) and a "root" job (the original job that started things).
Accordingly, there are new APIs to the name service to support the ability to get a job's parent, root, immediate children, and all its descendants. In addition, the terminate_job, terminate_orted, and signal_job APIs for the PLS have been modified to accept attributes that define the extent of their actions. For example, doing a "terminate_job" with an attribute of ORTE_NS_INCLUDE_DESCENDANTS will terminate the given jobid AND all jobs that descended from it.
I have tested this capability on a MacBook under rsh, Odin under SLURM, and LANL's Flash (bproc). It worked successfully on non-MPI jobs (both simple and including a spawn), and MPI jobs (again, both simple and with a spawn).
This commit was SVN r12597.
2006-11-14 19:34:59 +00:00
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_process_name(&test_name, tmp))) { /* got error */
|
|
|
|
fprintf(stderr, "convert string to process name failed with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "got process name: %ld %ld %ld\n", ORTE_NAME_ARGS(test_name));
|
|
|
|
}
|
|
|
|
free(tmp);
|
|
|
|
free(test_name);
|
|
|
|
|
|
|
|
/* create a cellid */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.create_cellid(&cell, "dummy-site", "dummy-resource"))) { /* got error */
|
|
|
|
fprintf(stderr, "create cellid: error with error %s\n", ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "cellid created: %lu\n", (unsigned long) cell);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get cellid info */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_cell_info(cell, &site, &resource))) { /* got error */
|
|
|
|
fprintf(stderr, "get_cell_info: error with error %s\n", ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "get_cell_info: %lu %s %s\n", (unsigned long) cell, site, resource);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i=0; i<10; i++) { /* loop through */
|
|
|
|
/* create jobid */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&job, NULL))) { /* got error */
|
|
|
|
fprintf(stderr, "create jobid: error with error %s\n", ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "jobid created: %lu\n", (unsigned long) job);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (j=0; j<5; j++) { /* loop through several vpid ranges */
|
|
|
|
/* get range of vpids */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(job, 250, &vpid))) { /* got error */
|
|
|
|
fprintf(stderr, "reserve range: error with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "range reserved: %lu\n",
|
|
|
|
(unsigned long) vpid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* create a name */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&test_name, (orte_cellid_t)i,
|
|
|
|
job, vpid))) {
|
|
|
|
fprintf(stderr, "test_ns_replica: failed to create proc name after vpid range with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get and print its string values */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&tmp, test_name))) {
|
|
|
|
fprintf(stderr, "test_ns_replica: failed to get proc_name_string with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "(%d) strings: name - %s\n", i, tmp);
|
|
|
|
}
|
|
|
|
free(tmp);
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&tmp, test_name))) {
|
|
|
|
fprintf(stderr, "test_ns_replica: failed to get vpid_string with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "(%d) strings: vpid - %s\n", i, tmp);
|
|
|
|
}
|
|
|
|
free(tmp);
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&tmp, test_name))) {
|
|
|
|
fprintf(stderr, "test_ns_replica: failed to get jobid_string with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "(%d) strings: jobid - %s\n", i, tmp);
|
|
|
|
}
|
|
|
|
free(tmp);
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid_string(&tmp, test_name))) {
|
|
|
|
fprintf(stderr, "test_ns_replica: failed to get cellid_string with error %s\n",
|
|
|
|
ORTE_ERROR_NAME(rc));
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "(%d) strings: cellid - %s\n", i, tmp);
|
|
|
|
}
|
|
|
|
free(tmp);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* finalize and see if memory cleared */
|
|
|
|
orte_ns_base_close();
|
|
|
|
|
|
|
|
orte_proc_info_finalize();
|
|
|
|
mca_base_close();
|
|
|
|
opal_malloc_finalize();
|
|
|
|
opal_output_finalize();
|
|
|
|
|
|
|
|
fclose( stderr );
|
|
|
|
|
|
|
|
return(0);
|
|
|
|
}
|