Add a few attributes that are helpful for folks doing things like Eclipse. Also add yet another command-line option to orterun to support one of the new attributes. These include:
1. ORTE_RMAPS_DISPLAY_AT_LAUNCH: pretty-prints the process map right before we launch so you can see where everyone is going. This is settable via the command-line option "--display-map-at-launch".
2. ORTE_RMGR_STOP_AFTER_SETUP: just set up the job and then return from the spawn command.
3. ORTE_RMGR_STOP_AFTER_ALLOC: return from the rmgr.spawn call after allocating the job.
4. ORTE_RMGR_STOP_AFTER_MAP: return from the rmgr.spawn call after mapping the job. This gives folks a chance to retrieve and graphically display the map, let the user edit it, and store the results. They can then call "launch" on their own and the system will use the revised map.
Enjoy! My personal favorite is the first one - it helps with debugging. This commit was SVN r12379.
Этот коммит содержится в:
родитель
17c71f8d2a
Коммит
30de73a712
@ -45,89 +45,12 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
orte_jobid_t * jptr;
|
||||
|
||||
/* so there are a lot of possibilities here */
|
||||
/* 1: we are not on the head node, so use the proxy component */
|
||||
/* Case 1: we are not on the head node, so use the proxy component */
|
||||
if (!orte_process_info.seed) {
|
||||
return orte_ras_base_proxy_allocate(jobid, attributes);
|
||||
}
|
||||
|
||||
/* 2: either no attributes were passed, or ORTE_RAS_INITIAL_ALLOCATION
|
||||
* was passed. This means that if the node segment is empty, we
|
||||
* want to allocate new nodes. Otherwise allocate all the existing nodes to
|
||||
* our job */
|
||||
if(NULL == attributes || opal_list_is_empty(attributes) ||
|
||||
NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_INITIAL_ALLOCATION))) {
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
/* See if there are any nodes already on the registry. Most of the time
|
||||
* these would have been put there by the RDS reading the hostfile. */
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
/* If there are any nodes at all, allocate them all to this job */
|
||||
if (!opal_list_is_empty(&nodes)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: reallocating nodes that are already on registry");
|
||||
ret = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* there were no nodes already on the registry, so get them from the
|
||||
* RAS components */
|
||||
|
||||
/* If no components are available, then return an error */
|
||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no components available!");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Otherwise, go through the [already sorted in priority order]
|
||||
* list and initialize them until one of them puts something on
|
||||
* the node segment */
|
||||
for (item = opal_list_get_first(&orte_ras_base.ras_available);
|
||||
item != opal_list_get_end(&orte_ras_base.ras_available);
|
||||
item = opal_list_get_next(item)) {
|
||||
cmp = (orte_ras_base_cmp_t *) item;
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: attemping to allocate using module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
|
||||
if (NULL != cmp->module->allocate_job) {
|
||||
ret = cmp->module->allocate_job(jobid, attributes);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
bool empty;
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If this module put something on the node segment,
|
||||
we're done */
|
||||
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: found good module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We didn't find anyone who put anything on the node segment */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no module put anything in the node segment");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Case 3: We want to use our parent's allocation. This can occur if we
|
||||
/* Case 2: We want to use our parent's allocation. This can occur if we
|
||||
* are doing a dynamic process spawn and don't want to do go through
|
||||
* the allocators again. */
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_PARENT_ALLOCATION))) {
|
||||
@ -144,8 +67,8 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Case 4: We want to use a new allocation. This can happen if we
|
||||
|
||||
/* Case 3: We want to get a new allocation. This can happen if we
|
||||
* are spawning a new process that does not want to use its parent's
|
||||
* allocation. */
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RAS_USE_NEW_ALLOCATION))) {
|
||||
@ -157,10 +80,10 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/* Otherwise, go through the [already sorted in priority order]
|
||||
* list and initialize them until one of them puts something on
|
||||
* the node segment */
|
||||
* list and call them until one of them puts something on
|
||||
* the node segment */
|
||||
for (item = opal_list_get_first(&orte_ras_base.ras_available);
|
||||
item != opal_list_get_end(&orte_ras_base.ras_available);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -168,21 +91,21 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: attemping to allocate using module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
|
||||
|
||||
if (NULL != cmp->module->allocate_job) {
|
||||
ret = cmp->module->allocate_job(jobid, attributes);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
bool empty;
|
||||
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/* If this module put something on the node segment,
|
||||
we're done */
|
||||
|
||||
we're done */
|
||||
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: found good module: %s",
|
||||
@ -192,7 +115,7 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* We didn't find anyone who put anything on the node segment */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no module put anything in the node segment");
|
||||
@ -200,8 +123,76 @@ int orte_ras_base_allocate(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Case 4: no RAS-specific directive was passed. This means that if the node segment is empty, we
|
||||
* want to allocate new nodes. Otherwise allocate all the existing nodes to
|
||||
* our job */
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
/* See if there are any nodes already on the registry. Most of the time
|
||||
* these would have been put there by the RDS reading the hostfile. */
|
||||
if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
/* If there are any nodes at all, allocate them all to this job */
|
||||
if (!opal_list_is_empty(&nodes)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: reallocating nodes that are already on registry");
|
||||
ret = orte_ras_base_allocate_nodes(jobid, &nodes);
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* none of the above cases fit. This is not a good thing... */
|
||||
/* there were no nodes already on the registry, so get them from the
|
||||
* RAS components */
|
||||
|
||||
/* If no components are available, then return an error */
|
||||
if (opal_list_is_empty(&orte_ras_base.ras_available)) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no components available!");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Otherwise, go through the [already sorted in priority order]
|
||||
* list and initialize them until one of them puts something on
|
||||
* the node segment */
|
||||
for (item = opal_list_get_first(&orte_ras_base.ras_available);
|
||||
item != opal_list_get_end(&orte_ras_base.ras_available);
|
||||
item = opal_list_get_next(item)) {
|
||||
cmp = (orte_ras_base_cmp_t *) item;
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: attemping to allocate using module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
|
||||
if (NULL != cmp->module->allocate_job) {
|
||||
ret = cmp->module->allocate_job(jobid, attributes);
|
||||
if (ORTE_SUCCESS == ret) {
|
||||
bool empty;
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
(ret = orte_ras_base_node_segment_empty(&empty))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If this module put something on the node segment,
|
||||
we're done */
|
||||
|
||||
if (!empty) {
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: found good module: %s",
|
||||
cmp->component->ras_version.mca_component_name);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We didn't find anyone who put anything on the node segment */
|
||||
opal_output(orte_ras_base.ras_output,
|
||||
"orte:ras:base:allocate: no module put anything in the node segment");
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
|
@ -41,6 +41,7 @@ extern "C" {
|
||||
#define ORTE_RMAPS_DESIRED_MAPPER "orte-map-desired"
|
||||
#define ORTE_RMAPS_USE_PARENT_PLAN "orte-map-use-parent-plan"
|
||||
#define ORTE_RMAPS_BOOKMARK "orte-map-bookmark"
|
||||
#define ORTE_RMAPS_DISPLAY_AFTER_MAP "orte-map-display"
|
||||
|
||||
/**** JOB_MAP OBJECTS ***/
|
||||
/*
|
||||
|
@ -46,6 +46,11 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_attribute_t);
|
||||
#define ORTE_RMGR_ATTR_NO_OVERRIDE false
|
||||
#define ORTE_RMGR_ATTR_OVERRIDE true
|
||||
|
||||
/* define some useful attributes */
|
||||
#define ORTE_RMGR_STOP_AFTER_SETUP "orte-rmgr-stop-setup"
|
||||
#define ORTE_RMGR_STOP_AFTER_ALLOC "orte-rmgr-stop-alloc"
|
||||
#define ORTE_RMGR_STOP_AFTER_MAP "orte-rmgr-stop-map"
|
||||
|
||||
|
||||
/* RESOURCE MANAGER DATA TYPES */
|
||||
|
||||
|
@ -294,6 +294,7 @@ static int orte_rmgr_urm_spawn_job(
|
||||
int rc;
|
||||
orte_process_name_t* name;
|
||||
struct timeval urmstart, urmstop;
|
||||
orte_job_map_t *map;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
@ -329,17 +330,34 @@ static int orte_rmgr_urm_spawn_job(
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMGR_STOP_AFTER_SETUP)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ras.allocate_job(*jobid, attributes))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(*jobid, attributes))) {
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMGR_STOP_AFTER_ALLOC)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(*jobid, attributes))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP)) {
|
||||
orte_rmaps.get_job_map(&map, *jobid);
|
||||
orte_dss.dump(0, map, ORTE_JOB_MAP);
|
||||
}
|
||||
|
||||
if (NULL != orte_rmgr.find_attribute(attributes, ORTE_RMGR_STOP_AFTER_MAP)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* setup I/O forwarding
|
||||
*/
|
||||
|
@ -79,6 +79,12 @@ that was returned: %d.
|
||||
%s was unable to set
|
||||
%s = %s
|
||||
in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:precondition]
|
||||
%s was unable to precondition transports
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:attr-failed]
|
||||
%s was unable to define an attribute
|
||||
Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:proc-aborted]
|
||||
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d.
|
||||
[orterun:abnormal-exit]
|
||||
|
@ -64,6 +64,7 @@
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/pls/pls.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
#include "orte/mca/schema/schema.h"
|
||||
#include "orte/mca/smr/smr.h"
|
||||
@ -112,6 +113,7 @@ struct globals_t {
|
||||
bool no_oversubscribe;
|
||||
bool debugger;
|
||||
bool no_local_schedule;
|
||||
bool displaymapatlaunch;
|
||||
int num_procs;
|
||||
int exit_status;
|
||||
char *hostfile;
|
||||
@ -197,7 +199,10 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
||||
{ NULL, NULL, NULL, '\0', "nooversubscribe", "nooversubscribe", 0,
|
||||
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||
|
||||
{ NULL, NULL, NULL, '\0', "display-map-at-launch", "display-map-at-launch", 0,
|
||||
&orterun_globals.displaymapatlaunch, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the process map just before launch"},
|
||||
|
||||
/* mpiexec-like arguments */
|
||||
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
||||
&orterun_globals.wdir, OPAL_CMD_LINE_TYPE_STRING,
|
||||
@ -422,6 +427,14 @@ int orterun(int argc, char *argv[])
|
||||
/* construct the list of attributes */
|
||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||
|
||||
if (orterun_globals.displaymapatlaunch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_DISPLAY_AFTER_MAP,
|
||||
ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:attr-failed", false,
|
||||
orterun_basename, NULL, NULL, rc);
|
||||
}
|
||||
}
|
||||
|
||||
/** setup callbacks for abort signals */
|
||||
opal_signal_set(&term_handler, SIGTERM,
|
||||
abort_signal_callback, &term_handler);
|
||||
@ -811,6 +824,7 @@ static int init_globals(void)
|
||||
orterun_globals.no_oversubscribe = false;
|
||||
orterun_globals.debugger = false;
|
||||
orterun_globals.no_local_schedule = false;
|
||||
orterun_globals.displaymapatlaunch = false;
|
||||
orterun_globals.num_procs = 0;
|
||||
orterun_globals.exit_status = 0;
|
||||
if( NULL == orterun_globals.hostfile )
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user