1
1

Merge pull request #2432 from hppritcha/topic/fix_info_etc_w_alps

pmix/cray: set some envars for MPI_INFO_ENV object
Этот коммит содержится в:
Howard Pritchard 2016-11-18 14:23:53 -07:00 коммит произвёл GitHub
родитель 184148d26a de3de131af
Коммит 44f4663d0d
2 изменённых файлов: 140 добавлений и 6 удалений

Просмотреть файл

@ -18,7 +18,6 @@
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/types.h"
#include "opal_stdint.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/argv.h"
@ -27,11 +26,8 @@
#include "opal/util/proc.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include <string.h>
#include <pmi.h>
#include <pmi2.h>
#include "opal/util/opal_getcwd.h"
#include "opal/constants.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/mca/pmix/base/pmix_base_hash.h"
#include "pmix_cray.h"
@ -158,6 +154,133 @@ static char* pmix_error(int pmix_err);
pmix_error(pmi_err)); \
} while(0);
static void cray_get_more_info(void)
{
int alps_status = 0, i;
uint64_t apid;
size_t alps_count;
int lli_ret = 0, place_ret;
alpsAppLayout_t layout;
char *npstring;
char *firstrankstring;
char **nps, **firstranks;
int *base_pe_in_app;
int *pes_in_app;
char pbuf[OPAL_PATH_MAX];
/*
* First get our apid
*/
lli_ret = alps_app_lli_lock();
if (0 != lli_ret) {
OPAL_OUTPUT_VERBOSE((20, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_app_lli_lock returned %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lli_ret));
goto fn_exit;
}
lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0);
if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
OPAL_OUTPUT_VERBOSE((20, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_app_lli_put_request - APID returned %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lli_ret));
goto fn_exit_w_lock;
}
lli_ret = alps_app_lli_get_response (&alps_status, &alps_count);
if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) {
OPAL_OUTPUT_VERBOSE((20, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_app_lli_get_response returned %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), alps_status));
goto fn_exit_w_lock;
}
lli_ret = alps_app_lli_get_response_bytes (&apid, sizeof(apid));
if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
OPAL_OUTPUT_VERBOSE((20, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_app_lli_get_response_bytes returned %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lli_ret));
goto fn_exit_w_lock;
}
/*
* get some items from alps placement file
*/
place_ret = alps_get_placement_info(apid,
&layout,
NULL,
NULL,
NULL,
NULL,
NULL,
&base_pe_in_app,
&pes_in_app,
NULL,
NULL);
if (1 != place_ret) {
OPAL_OUTPUT_VERBOSE((20, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_get_placement_info returned %d (%s)",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), place_ret, strerror(errno)));
goto fn_exit;
}
OPAL_OUTPUT_VERBOSE((2, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_get_placement_info returned %d first pe on node is %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), place_ret, layout.firstPe));
nps = NULL;
firstranks = NULL;
for (i=0; i < layout.numCmds; i++) {
snprintf(pbuf, sizeof(pbuf), "%d", pes_in_app[i]);
opal_argv_append_nosize(&nps, pbuf);
snprintf(pbuf, sizeof(pbuf), "%d", base_pe_in_app[i]);
opal_argv_append_nosize(&firstranks, pbuf);
}
npstring = opal_argv_join(nps, ' ');
firstrankstring = opal_argv_join(firstranks, ' ');
opal_argv_free(nps);
opal_argv_free(firstranks);
/*
* stuff values into environment variables
*/
/* add these envars to prep MPI-2 info pre-defined key/values */
snprintf(pbuf, sizeof(pbuf), "%d", layout.numCmds);
opal_setenv("OMPI_NUM_APP_CTX", pbuf, true, &environ);
opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &environ);
opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &environ);
free(firstrankstring);
free(npstring);
free(base_pe_in_app);
free(pes_in_app);
/*
* ALPS always starts the application in the directory
* where the aprun command was run to do the launch.
* For SLURM, we have to check the SLURM_WORKING_DIR env.
* variable. If it is set, we can't set wdir since
* we can't assume PWD is where we started.
*/
if(getenv("SLURM_WORKING_DIR") == NULL) {
opal_getcwd(pbuf, OPAL_PATH_MAX);
opal_setenv("OMPI_MCA_initial_wdir", pbuf, true, &environ);
}
fn_exit_w_lock:
lli_ret = alps_app_lli_unlock();
if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) {
OPAL_OUTPUT_VERBOSE((20, opal_pmix_base_framework.framework_output,
"%s pmix:cray: alps_app_lli_unlock returned %d",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lli_ret));
}
fn_exit:
return;
}
static int cray_init(void)
{
int i, spawned, size, rank, appnum, my_node;
@ -419,6 +542,8 @@ static int cray_init(void)
}
OBJ_DESTRUCT(&kv);
cray_get_more_info();
return OPAL_SUCCESS;
err_exit:
PMI2_Finalize();

Просмотреть файл

@ -10,12 +10,21 @@
#ifndef MCA_PMIX_CRAY_H
#define MCA_PMIX_CRAY_H
#include <pmi.h>
#include <pmi2.h>
#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/pmix/base/pmix_base_fns.h"
#include "opal/util/proc.h"
#include "alps/alps.h"
#include "alps/alps_toolAssist.h"
#include "alps/libalpsutil.h"
#include "alps/libalpslli.h"
#include "pmix_cray_pmap_parser.h"
BEGIN_C_DECLS