1
1

Add support for the add-host and add-hostfile MPI Info keys to allow Comm_spawn users to add new hosts to those already known by mpirun.

Requires full testing once comm_spawn is fixed (Edgar is working on that now).

This commit was SVN r21664.
Этот коммит содержится в:
Ralph Castain 2009-07-14 14:34:11 +00:00
родитель e37959c168
Коммит dbac602be5
14 изменённых файлов: 270 добавлений и 46 удалений

Просмотреть файл

@ -627,7 +627,13 @@ static int spawn(int count, char **array_of_commands,
app->add_hostfile = strdup(host);
}
/* 'path', 'arch', 'file', 'soft', 'add-host' -- to be implemented */
/* check for 'add-host' */
ompi_info_get (array_of_info[i], "add-host", sizeof(host) - 1, host, &flag);
if ( flag ) {
opal_argv_append_nosize(&app->add_host, host);
}
/* 'path', 'arch', 'file', 'soft' -- to be implemented */
/* check for 'ompi_prefix' (OMPI-specific -- to effect the same
* behavior as --prefix option to orterun)

Просмотреть файл

@ -108,6 +108,14 @@ hostfile char * Hostfile containing the hosts on which
the processes are to be spawned. See
the \fIorte_hostfile\fP man page for an
explanation of how this will be used.
add-host char * Add the specified host to the list of
hosts known to this job and use it
for the associated process. This will
be used similarly to the -host option.
add-hostfile char * Hostfile containing hosts to be added
to the list of hosts known to this job and
used for the associated process. This will
be used similarly to the -hostfile option.
wdir char * Directory where the executable is located. If
files are to be pre-positioned, then this
location is the desired working directory

Просмотреть файл

@ -119,6 +119,14 @@ hostfile char * Hostfile containing the hosts on which
the processes are to be spawned. See
the \fIorte_hostfile\fP man page for an
explanation of how this will be used.
add-host char * Add the specified host to the list of
hosts known to this job and use it
for the associated processes. This will
be used similarly to the -host option.
add-hostfile char * Hostfile containing hosts to be added
to the list of hosts known to this job and
used for the associated processes. This will
be used similarly to the -hostfile option.
wdir char * Directory where the executable is located. If
files are to be pre-positioned, then this
location is the desired working directory

Просмотреть файл

@ -43,6 +43,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ras/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
@ -164,7 +165,7 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
* orteds can find the correct binary. There always has to be at
* least one app_context in both parent and child, so we don't
* need to check that here. However, be sure not to overwrite
* the prefix if the user already provide it!
* the prefix if the user already provided it!
*/
app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
@ -173,6 +174,12 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
child_app->prefix_dir = strdup(app->prefix_dir);
}
/* process any add-hostfile and add-host options that were provided */
if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH;
}
/* find the sender's node in the job map */
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, mev->sender.vpid))) {
/* set the bookmark so the child starts from that place - this means

Просмотреть файл

@ -57,6 +57,8 @@ ORTE_DECLSPEC int orte_ras_base_select(void);
ORTE_DECLSPEC int orte_ras_base_finalize(void);
ORTE_DECLSPEC int orte_ras_base_close(void);
ORTE_DECLSPEC int orte_ras_base_add_hosts(orte_job_t *jdata);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -42,6 +42,41 @@
#include "orte/mca/ras/base/ras_private.h"
/*
 * Print the contents of the global node pool.
 *
 * Every non-NULL entry of orte_node_pool is rendered with opal_dss.print
 * and appended to a single string, which is then emitted in one call to
 * opal_output on the orte_clean_output stream. When orte_xml_output is
 * set the listing is wrapped in <allocation> ... </allocation> and each
 * node is printed with a tab prefix; otherwise a banner is printed above
 * and below the listing.
 *
 * If a string cannot be built (asprintf fails) the function releases what
 * it holds and returns without printing anything. A node for which the
 * printer yields no string is skipped.
 */
static void display_alloc(void)
{
    char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL;
    int i;
    orte_node_t *alloc;

    /* start the output with the appropriate header */
    if (orte_xml_output) {
        if (0 > asprintf(&tmp, "<allocation>\n")) {
            /* the buffer contents are not usable after a failure */
            return;
        }
        pfx = "\t";
    } else {
        if (0 > asprintf(&tmp, "\n======================   ALLOCATED NODES   ======================\n")) {
            return;
        }
    }

    for (i=0; i < orte_node_pool->size; i++) {
        /* the pool may contain holes - skip over them */
        if (NULL == (alloc = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
            continue;
        }
        /* preset the result so a printer that produced nothing can be
         * detected instead of reading an indeterminate pointer
         */
        tmp2 = NULL;
        opal_dss.print(&tmp2, pfx, alloc, ORTE_NODE);
        if (NULL == tmp2) {
            continue;
        }
        /* append this node's description to what we have so far */
        tmp3 = NULL;
        if (0 > asprintf(&tmp3, "%s%s", tmp, tmp2)) {
            free(tmp2);
            free(tmp);
            return;
        }
        free(tmp);
        free(tmp2);
        tmp = tmp3;
    }

    /* close the listing and emit it in a single call */
    if (orte_xml_output) {
        opal_output(orte_clean_output, "%s</allocation>\n", tmp);
    } else {
        opal_output(orte_clean_output, "%s\n\n=================================================================\n", tmp);
    }
    free(tmp);
}
/*
* Function for selecting one component from all those that are
* available.
@ -50,10 +85,10 @@ int orte_ras_base_allocate(orte_job_t *jdata)
{
int rc;
opal_list_t nodes;
orte_node_t *node, **alloc;
orte_node_t *node;
orte_std_cntr_t i;
bool override_oversubscribed;
orte_app_context_t **apps;
orte_app_context_t *app;
OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
"%s ras:base:allocate",
@ -188,21 +223,20 @@ int orte_ras_base_allocate(orte_job_t *jdata)
* can be present
*/
/* convenience def */
apps = (orte_app_context_t**)jdata->apps->addr;
for (i=0; i < jdata->num_apps; i++) {
if (NULL != apps[i]->hostfile) {
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (NULL != app->hostfile) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
"%s ras:base:allocate checking hostfile %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
apps[i]->hostfile));
app->hostfile));
/* hostfile was specified - parse it and add it to the list */
if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
&override_oversubscribed,
apps[i]->hostfile))) {
app->hostfile))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&nodes);
return rc;
@ -249,11 +283,14 @@ int orte_ras_base_allocate(orte_job_t *jdata)
* generate an error in this scenario, so only non-relative syntax
* can be present
*/
for (i=0; i < jdata->num_apps; i++) {
if (NULL != apps[i]->dash_host) {
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (NULL != app->dash_host) {
if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
&override_oversubscribed,
apps[i]->dash_host))) {
app->dash_host))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&nodes);
return rc;
@ -318,35 +355,99 @@ int orte_ras_base_allocate(orte_job_t *jdata)
DISPLAY:
/* shall we display the results? */
if (orte_ras_base.display_alloc) {
char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL;
if (orte_xml_output) {
asprintf(&tmp, "<allocation>\n");
pfx = "\t";
} else {
asprintf(&tmp, "\n====================== ALLOCATED NODES ======================\n");
}
alloc = (orte_node_t**)orte_node_pool->addr;
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == alloc[i]) {
break;
}
opal_dss.print(&tmp2, pfx, alloc[i], ORTE_NODE);
if (NULL == tmp) {
tmp = tmp2;
} else {
asprintf(&tmp3, "%s%s", tmp, tmp2);
free(tmp);
free(tmp2);
tmp = tmp3;
}
}
if (orte_xml_output) {
opal_output(orte_clean_output, "%s</allocation>\n", tmp);
} else {
opal_output(orte_clean_output, "%s\n\n=================================================================\n", tmp);
}
free(tmp);
display_alloc();
}
return rc;
}
/*
 * Add the hosts named by a job's add-hostfile and add-host entries to the
 * global resource pool.
 *
 * Each non-NULL app_context of the job is examined twice: first for an
 * add_hostfile, then for an add_host list. Whatever the parsers place on
 * the local node list is handed to orte_ras_base_node_insert, and the
 * job's oversubscribe_override flag is updated from the parsers' result.
 * The allocation is displayed afterwards if orte_ras_base.display_alloc
 * is set.
 *
 * @param jdata  Job whose app_contexts are scanned and whose
 *               oversubscribe_override flag is updated when nodes were
 *               found.
 *
 * @return ORTE_SUCCESS when everything succeeded (including the case
 *         where no add-hostfile/add-host entries were present);
 *         otherwise the error code reported by the failing parser or by
 *         the node insert.
 */
int orte_ras_base_add_hosts(orte_job_t *jdata)
{
    int rc;
    opal_list_t nodes;
    /* start from a defined value - whether the parsers always write this
     * flag cannot be seen from here, so do not rely on it
     */
    bool override_oversubscribed = false;
    int i;
    orte_app_context_t *app;

    /* construct a list to hold the results */
    OBJ_CONSTRUCT(&nodes, opal_list_t);

    /* Individual add-hostfile names, if given, are included
     * in the app_contexts for this job. We therefore need to
     * retrieve the app_contexts for the job, and then cycle
     * through them to see if anything is there. The parser will
     * add the nodes found in each add-hostfile to our list - i.e.,
     * the resulting list contains the UNION of all nodes specified
     * in add-hostfiles from across all app_contexts
     *
     * Note that any relative node syntax found in the add-hostfiles will
     * generate an error in this scenario, so only non-relative syntax
     * can be present
     */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->add_hostfile) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                                 "%s ras:base:add_hosts checking add-hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->add_hostfile));

            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   &override_oversubscribed,
                                                                   app->add_hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                return rc;
            }
        }
    }

    /* We next check for and add any add-host options. Note this is
     * a -little- different than dash-host in that (a) we add these
     * nodes to the global pool regardless of what may already be there,
     * and (b) as a result, any job and/or app_context can access them.
     *
     * Note that any relative node syntax found in the add-host lists will
     * generate an error in this scenario, so only non-relative syntax
     * can be present
     */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->add_host) {
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
                                                                    &override_oversubscribed,
                                                                    app->add_host))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                return rc;
            }
        }
    }

    /* if something was found, we add that to our global pool */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            /* report the failure to the caller instead of claiming success */
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            return rc;
        }
        /* update the jdata object with override_oversubscribed flag */
        jdata->oversubscribe_override = override_oversubscribed;
    }

    /* cleanup - the list was constructed unconditionally, so release it
     * whether or not anything was placed on it
     */
    OBJ_DESTRUCT(&nodes);

    /* shall we display the results? */
    if (orte_ras_base.display_alloc) {
        display_alloc();
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -117,6 +117,25 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
}
/* did the app_context contain an add-hostfile? */
if (NULL != app->add_hostfile) {
/* yes - filter the node list through the file, removing
* any nodes not found in the file
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(allocated_nodes,
app->add_hostfile))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
}
/* now filter the list through any -host specification */
if (NULL != app->dash_host) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
@ -132,6 +151,21 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
}
}
/* now filter the list through any add-host specification */
if (NULL != app->add_host) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
app->add_host))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "");
return ORTE_ERR_SILENT;
}
}
/* If the "no local" option was set, then remove the local node
* from the list
*/

Просмотреть файл

@ -169,6 +169,8 @@ int orte_dt_copy_app_context(orte_app_context_t **dest, orte_app_context_t *src,
(*dest)->add_hostfile = strdup(src->add_hostfile);
}
(*dest)->add_host = opal_argv_copy(src->add_host);
(*dest)->dash_host = opal_argv_copy(src->dash_host);
if (NULL != src->prefix_dir) {

Просмотреть файл

@ -603,6 +603,22 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
return rc;
}
/* pack the add host argv array */
count = opal_argv_count(app_context[i]->add_host);
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if there are entries, pack the argv entries */
if (0 < count) {
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(app_context[i]->add_host), count, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the dash host argv array */
count = opal_argv_count(app_context[i]->dash_host);
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) {

Просмотреть файл

@ -320,7 +320,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
/* just provide a simple output for users */
if (0 == src->num_procs) {
/* no procs mapped yet, so just show allocation */
asprintf(&tmp, "\n%sData for node: Name: %s\tNum slots: %ld\tMax slots: %ld",
asprintf(&tmp, "\n%sData for node: %s\tNum slots: %ld\tMax slots: %ld",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name,
(long)src->slots, (long)src->slots_max);
/* does this node have any aliases? */
@ -335,7 +335,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
*output = tmp;
return ORTE_SUCCESS;
}
asprintf(&tmp, "\n%sData for node: Name: %s\tNum procs: %ld",
asprintf(&tmp, "\n%sData for node: %s\tNum procs: %ld",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name,
(long)src->num_procs);
/* does this node have any aliases? */
@ -349,7 +349,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
goto PRINT_PROCS;
}
asprintf(&tmp, "\n%sData for node: Name: %s\t%s\tLaunch id: %ld\tState: %0x",
asprintf(&tmp, "\n%sData for node: %s\t%s\tLaunch id: %ld\tState: %0x",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name,
pfx2, (long)src->launch_id,
src->state);
@ -555,6 +555,13 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
free(tmp);
tmp = tmp2;
count = opal_argv_count(src->add_host);
for (i=0; i < count; i++) {
asprintf(&tmp2, "%s\n%s\tAdd_host[%lu]: %s", tmp, pfx2, (unsigned long)i, src->add_host[i]);
free(tmp);
tmp = tmp2;
}
count = opal_argv_count(src->dash_host);
for (i=0; i < count; i++) {
asprintf(&tmp2, "%s\n%s\tDash_host[%lu]: %s", tmp, pfx2, (unsigned long)i, src->dash_host[i]);

Просмотреть файл

@ -232,6 +232,8 @@ int orte_dt_size_app_context(size_t *size, orte_app_context_t *src, opal_data_ty
*size += strlen(src->add_hostfile); /* add_hostfile name */
}
*size += opal_argv_len(src->add_host);
*size += opal_argv_len(src->dash_host);
if (NULL != src->prefix_dir) {

Просмотреть файл

@ -660,6 +660,30 @@ int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
return rc;
}
/* get the number of add_host strings that were packed */
max_n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &max_n, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if there are add_host strings, allocate the required space for the char * pointers */
if (0 < count) {
app_context[i]->add_host = (char **)malloc((count+1) * sizeof(char*));
if (NULL == app_context[i]->add_host) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
app_context[i]->add_host[count] = NULL;
/* and unpack them */
max_n = count;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, app_context[i]->add_host, &max_n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* get the number of dash_host strings that were packed */
max_n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &max_n, ORTE_STD_CNTR))) {

Просмотреть файл

@ -453,6 +453,7 @@ static void orte_app_context_construct(orte_app_context_t* app_context)
app_context->user_specified_cwd=false;
app_context->hostfile=NULL;
app_context->add_hostfile=NULL;
app_context->add_host = NULL;
app_context->dash_host = NULL;
app_context->prefix_dir = NULL;
app_context->preload_binary = false;
@ -489,6 +490,10 @@ static void orte_app_context_destructor(orte_app_context_t* app_context)
free(app_context->add_hostfile);
}
if (NULL != app_context->add_host) {
opal_argv_free(app_context->add_host);
}
if (NULL != app_context->dash_host) {
opal_argv_free(app_context->dash_host);
}

Просмотреть файл

@ -166,6 +166,8 @@ typedef struct {
char *hostfile;
/* Hostfile for adding hosts to an existing allocation */
char *add_hostfile;
/* Hosts to be added to an existing allocation - analogous to -host */
char **add_host;
/** argv of hosts passed in to -host */
char ** dash_host;
/** Prefix directory for this app (or NULL if no override necessary) */