Merge pull request #890 from rhc54/topic/fixpmi
Revert "Revert "Fix the handling of cpusets so we get the correct cpu…
Этот коммит содержится в:
Коммит
c31093ff19
@ -140,7 +140,7 @@ static int ompi_proc_complete_init_single (ompi_proc_t *proc)
|
||||
|
||||
/* get the locality information - all RTEs are required
|
||||
* to provide this information at startup */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
|
||||
} else {
|
||||
@ -149,10 +149,10 @@ static int ompi_proc_complete_init_single (ompi_proc_t *proc)
|
||||
|
||||
/* we can retrieve the hostname at no cost because it
|
||||
* was provided at startup */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name,
|
||||
(char**)&(proc->super.proc_hostname), OPAL_STRING);
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name,
|
||||
(char**)&(proc->super.proc_hostname), OPAL_STRING);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
/* get the remote architecture - this might force a modex except
|
||||
@ -345,7 +345,7 @@ int ompi_proc_complete_init(void)
|
||||
|
||||
/* the runtime is required to fill in locality for all local processes by this
|
||||
* point. only local processes will have locality set */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16);
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16);
|
||||
if (OPAL_SUCCESS == ret) {
|
||||
locality = u16;
|
||||
}
|
||||
|
@ -61,9 +61,10 @@ static int cray_fence_nb(opal_list_t *procs, int collect_data,
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
static int cray_commit(void);
|
||||
static int cray_get(const opal_process_name_t *id,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_value_t **kv);
|
||||
static int cray_get_nb(const opal_process_name_t *id, const char *key,
|
||||
opal_list_t *info,
|
||||
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
|
||||
static int cray_publish(opal_list_t *info);
|
||||
static int cray_publish_nb(opal_list_t *info,
|
||||
@ -735,7 +736,7 @@ static int cray_fence_nb(opal_list_t *procs, int collect_data,
|
||||
return OPAL_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
static int cray_get(const opal_process_name_t *id, const char *key, opal_value_t **kv)
|
||||
static int cray_get(const opal_process_name_t *id, const char *key, opal_list_t *info, opal_value_t **kv)
|
||||
{
|
||||
int rc;
|
||||
opal_list_t vals;
|
||||
@ -762,7 +763,7 @@ static int cray_get(const opal_process_name_t *id, const char *key, opal_value_t
|
||||
}
|
||||
|
||||
static int cray_get_nb(const opal_process_name_t *id, const char *key,
|
||||
opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
|
||||
opal_list_t *info, opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
return OPAL_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
@ -108,6 +108,43 @@ extern int opal_pmix_verbose_output;
|
||||
free(_key); \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process when we don't want the PMIx module
|
||||
* to request it from the server if not found:
|
||||
*
|
||||
* r - the integer return status from the modex op (int)
|
||||
* s - string key (char*)
|
||||
* p - pointer to the opal_process_name_t of the proc that posted
|
||||
* the data (opal_process_name_t*)
|
||||
* d - pointer to a location wherein the data object
|
||||
* is to be returned
|
||||
* t - the expected data type
|
||||
*/
|
||||
#define OPAL_MODEX_RECV_VALUE_OPTIONAL(r, s, p, d, t) \
|
||||
do { \
|
||||
opal_value_t *_kv, *_info; \
|
||||
opal_list_t _ilist; \
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
|
||||
"%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), (s))); \
|
||||
OBJ_CONSTRUCT(&(_ilist), opal_list_t); \
|
||||
_info = OBJ_NEW(opal_value_t); \
|
||||
_info->key = strdup(OPAL_PMIX_OPTIONAL); \
|
||||
_info->type = OPAL_BOOL; \
|
||||
_info->data.flag = true; \
|
||||
opal_list_append(&(_ilist), &(_info)->super); \
|
||||
if (OPAL_SUCCESS != ((r) = opal_pmix.get((p), (s), &(_ilist), &(_kv)))) { \
|
||||
*(d) = NULL; \
|
||||
} else { \
|
||||
(r) = opal_value_unload(_kv, (void**)(d), (t)); \
|
||||
OBJ_RELEASE(_kv); \
|
||||
} \
|
||||
OPAL_LIST_DESTRUCT(&(_ilist)); \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
@ -128,7 +165,7 @@ extern int opal_pmix_verbose_output;
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), (s))); \
|
||||
if (OPAL_SUCCESS != ((r) = opal_pmix.get((p), (s), &(_kv)))) { \
|
||||
if (OPAL_SUCCESS != ((r) = opal_pmix.get((p), (s), NULL, &(_kv)))) { \
|
||||
*(d) = NULL; \
|
||||
} else { \
|
||||
(r) = opal_value_unload(_kv, (void**)(d), (t)); \
|
||||
@ -157,7 +194,7 @@ extern int opal_pmix_verbose_output;
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), (s))); \
|
||||
if (OPAL_SUCCESS == ((r) = opal_pmix.get((p), (s), &(_kv))) && \
|
||||
if (OPAL_SUCCESS == ((r) = opal_pmix.get((p), (s), NULL, &(_kv))) && \
|
||||
NULL != _kv) { \
|
||||
*(d) = _kv->data.bo.bytes; \
|
||||
*(sz) = _kv->data.bo.size; \
|
||||
@ -301,23 +338,24 @@ typedef int (*opal_pmix_base_module_put_fn_t)(opal_pmix_scope_t scope,
|
||||
opal_value_t *val);
|
||||
|
||||
/* Retrieve information for the specified _key_ as published by the rank
|
||||
* and jobid i the provided opal_process_name, returning a pointer to the value in the
|
||||
* given address.
|
||||
* and jobid i the provided opal_process_name, and subject to any provided
|
||||
* constraints, returning a pointer to the value in the given address.
|
||||
*
|
||||
* This is a blocking operation - the caller will block until
|
||||
* the specified data has been _PMIx_Put_ by the specified rank. The caller is
|
||||
* responsible for freeing all memory associated with the returned value when
|
||||
* no longer required. */
|
||||
typedef int (*opal_pmix_base_module_get_fn_t)(const opal_process_name_t *proc,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_value_t **val);
|
||||
|
||||
/* Retrieve information for the specified _key_ as published by the given rank
|
||||
* and jobid in the opal_process_name_t. This is a non-blocking operation - the
|
||||
* and jobid in the opal_process_name_t, and subject to any provided
|
||||
* constraints. This is a non-blocking operation - the
|
||||
* callback function will be executed once the specified data has been _PMIx_Put_
|
||||
* by the specified proc and retrieved by the local server. */
|
||||
typedef int (*opal_pmix_base_module_get_nb_fn_t)(const opal_process_name_t *proc,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
/* Publish the given data to the "universal" nspace
|
||||
|
@ -160,12 +160,15 @@ BEGIN_C_DECLS
|
||||
/* request-related info */
|
||||
#define PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
|
||||
#define PMIX_TIMEOUT "pmix.timeout" // (int) time in sec before specified operation should time out
|
||||
#define PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until the specified #values are found
|
||||
#define PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until at least the specified
|
||||
// #values are found (0 => all and is the default)
|
||||
#define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
|
||||
#define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
|
||||
#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
|
||||
#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish
|
||||
#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish
|
||||
#define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
|
||||
// not request data from the server if not found
|
||||
|
||||
/* attributes used by host server to pass data to the server convenience library - the
|
||||
* data will then be parsed and provided to the local clients */
|
||||
|
@ -117,6 +117,7 @@ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char *key,
|
||||
pmix_status_t rc;
|
||||
char *nm;
|
||||
pmix_nspace_t *ns, *nptr;
|
||||
size_t n;
|
||||
|
||||
if (NULL == proc) {
|
||||
return PMIX_ERR_BAD_PARAM;
|
||||
@ -252,7 +253,7 @@ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char *key,
|
||||
* key to eventually be found, so all we can do is return
|
||||
* the error */
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"Error requesting key=%s for rank = %d, namespace = %s\n",
|
||||
"Error requesting key=%s for rank = %d, namespace = %s",
|
||||
key, proc->rank, nm);
|
||||
return rc;
|
||||
}
|
||||
@ -265,6 +266,18 @@ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char *key,
|
||||
return PMIX_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* we also have to check the user's directives to see if they do not want
|
||||
* us to attempt to retrieve it from the server */
|
||||
for (n=0; n < ninfo; n++) {
|
||||
if (0 == strcmp(info[n].key, PMIX_OPTIONAL) &&
|
||||
info[n].value.data.flag) {
|
||||
/* they don't want us to try and retrieve it */
|
||||
pmix_output_verbose(2, pmix_globals.debug_output,
|
||||
"PMIx_Get key=%s for rank = %d, namespace = %s was not found - request was optional",
|
||||
key, proc->rank, nm);
|
||||
return PMIX_ERR_NOT_FOUND;
|
||||
}
|
||||
}
|
||||
/* see if we already have a request in place with the server for data from
|
||||
* this nspace:rank. If we do, then no need to ask again as the
|
||||
* request will return _all_ data from that proc */
|
||||
|
@ -77,18 +77,18 @@ OPAL_MODULE_DECLSPEC int pmix1_client_init(void);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_client_finalize(void);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_initialized(void);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_abort(int flag, const char *msg,
|
||||
opal_list_t *procs);
|
||||
opal_list_t *procs);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_commit(void);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_fence(opal_list_t *procs, int collect_data);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_fencenb(opal_list_t *procs, int collect_data,
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_put(opal_pmix_scope_t scope,
|
||||
opal_value_t *val);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_get(const opal_process_name_t *proc,
|
||||
const char *key, opal_value_t **val);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_getnb(const opal_process_name_t *proc,
|
||||
const char *key,
|
||||
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_get(const opal_process_name_t *proc, const char *key,
|
||||
opal_list_t *info, opal_value_t **val);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_getnb(const opal_process_name_t *proc, const char *key,
|
||||
opal_list_t *info,
|
||||
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_publish(opal_list_t *info);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_publishnb(opal_list_t *info,
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
@ -100,17 +100,17 @@ OPAL_MODULE_DECLSPEC int pmix1_unpublishnb(char **keys, opal_list_t *info,
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_spawnnb(opal_list_t *job_info, opal_list_t *apps,
|
||||
opal_pmix_spawn_cbfunc_t cbfunc, void *cbdata);
|
||||
opal_pmix_spawn_cbfunc_t cbfunc, void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_connect(opal_list_t *procs);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_connectnb(opal_list_t *procs,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_disconnect(opal_list_t *procs);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_disconnectnb(opal_list_t *procs,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid,
|
||||
opal_list_t *procs);
|
||||
opal_list_t *procs);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist);
|
||||
|
||||
/**** COMMON FUNCTIONS ****/
|
||||
@ -123,32 +123,32 @@ OPAL_MODULE_DECLSPEC int pmix1_server_finalize(void);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_gen_regex(const char *input, char **regex);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_gen_ppn(const char *input, char **ppn);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_register_nspace(opal_jobid_t jobid,
|
||||
int nlocalprocs,
|
||||
opal_list_t *info,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
int nlocalprocs,
|
||||
opal_list_t *info,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_register_client(const opal_process_name_t *proc,
|
||||
uid_t uid, gid_t gid,
|
||||
void *server_object,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
uid_t uid, gid_t gid,
|
||||
void *server_object,
|
||||
opal_pmix_op_cbfunc_t cbfunc,
|
||||
void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_setup_fork(const opal_process_name_t *proc, char ***env);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_dmodex(const opal_process_name_t *proc,
|
||||
opal_pmix_modex_cbfunc_t cbfunc, void *cbdata);
|
||||
opal_pmix_modex_cbfunc_t cbfunc, void *cbdata);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_server_notify_error(int status,
|
||||
opal_list_t *procs,
|
||||
opal_list_t *error_procs,
|
||||
opal_list_t *info,
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
opal_list_t *procs,
|
||||
opal_list_t *error_procs,
|
||||
opal_list_t *info,
|
||||
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
|
||||
|
||||
|
||||
/**** COMPONENT UTILITY FUNCTIONS ****/
|
||||
OPAL_MODULE_DECLSPEC pmix_status_t pmix1_convert_opalrc(int rc);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_convert_rc(pmix_status_t rc);
|
||||
OPAL_MODULE_DECLSPEC void pmix1_value_load(pmix_value_t *v,
|
||||
opal_value_t *kv);
|
||||
opal_value_t *kv);
|
||||
OPAL_MODULE_DECLSPEC int pmix1_value_unload(opal_value_t *kv,
|
||||
const pmix_value_t *v);
|
||||
const pmix_value_t *v);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -290,13 +290,16 @@ int pmix1_put(opal_pmix_scope_t scope,
|
||||
return pmix1_convert_rc(rc);
|
||||
}
|
||||
|
||||
int pmix1_get(const opal_process_name_t *proc,
|
||||
const char *key, opal_value_t **val)
|
||||
int pmix1_get(const opal_process_name_t *proc, const char *key,
|
||||
opal_list_t *info, opal_value_t **val)
|
||||
{
|
||||
int ret;
|
||||
pmix_value_t *kv;
|
||||
pmix_status_t rc;
|
||||
pmix_proc_t p, *pptr;
|
||||
size_t ninfo, n;
|
||||
pmix_info_t *pinfo;
|
||||
opal_value_t *ival;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"%s PMIx_client get on proc %s key %s",
|
||||
@ -326,8 +329,25 @@ int pmix1_get(const opal_process_name_t *proc,
|
||||
pptr = NULL;
|
||||
}
|
||||
|
||||
if (NULL != info) {
|
||||
ninfo = opal_list_get_size(info);
|
||||
if (0 < ninfo) {
|
||||
PMIX_INFO_CREATE(pinfo, ninfo);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ival, info, opal_value_t) {
|
||||
(void)strncpy(pinfo[n].key, ival->key, PMIX_MAX_KEYLEN);
|
||||
pmix1_value_load(&pinfo[n].value, ival);
|
||||
}
|
||||
} else {
|
||||
pinfo = NULL;
|
||||
}
|
||||
} else {
|
||||
pinfo = NULL;
|
||||
ninfo = 0;
|
||||
}
|
||||
|
||||
/* pass the request down */
|
||||
rc = PMIx_Get(pptr, key, NULL, 0, &kv);
|
||||
rc = PMIx_Get(pptr, key, pinfo, ninfo, &kv);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
if (NULL == kv) {
|
||||
ret = OPAL_SUCCESS;
|
||||
@ -339,6 +359,7 @@ int pmix1_get(const opal_process_name_t *proc,
|
||||
} else {
|
||||
ret = pmix1_convert_rc(rc);
|
||||
}
|
||||
PMIX_INFO_FREE(pinfo, ninfo);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -362,11 +383,14 @@ static void val_cbfunc(pmix_status_t status,
|
||||
}
|
||||
|
||||
int pmix1_getnb(const opal_process_name_t *proc, const char *key,
|
||||
opal_list_t *info,
|
||||
opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
|
||||
{
|
||||
pmix1_opcaddy_t *op;
|
||||
pmix_status_t rc;
|
||||
char *tmp;
|
||||
size_t n;
|
||||
opal_value_t *ival;
|
||||
|
||||
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
|
||||
"%s PMIx_client get_nb on proc %s key %s",
|
||||
@ -389,9 +413,20 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key,
|
||||
op->p.rank = PMIX_RANK_WILDCARD;
|
||||
}
|
||||
|
||||
if (NULL != info) {
|
||||
op->sz = opal_list_get_size(info);
|
||||
if (0 < op->sz) {
|
||||
PMIX_INFO_CREATE(op->info, op->sz);
|
||||
n=0;
|
||||
OPAL_LIST_FOREACH(ival, info, opal_value_t) {
|
||||
(void)strncpy(op->info[n].key, ival->key, PMIX_MAX_KEYLEN);
|
||||
pmix1_value_load(&op->info[n].value, ival);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* call the library function */
|
||||
rc = PMIx_Get_nb(&op->p, key, NULL, 0, val_cbfunc, op);
|
||||
rc = PMIx_Get_nb(&op->p, key, op->info, op->sz, val_cbfunc, op);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
OBJ_RELEASE(op);
|
||||
}
|
||||
|
@ -98,12 +98,15 @@ BEGIN_C_DECLS
|
||||
/* request-related info */
|
||||
#define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
|
||||
#define OPAL_PMIX_TIMEOUT "pmix.timeout" // (int) time in sec before specified operation should time out
|
||||
#define OPAL_PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until the specified #values are found
|
||||
#define OPAL_PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until at least the specified
|
||||
// #values are found (0 => all and is the default)
|
||||
#define OPAL_PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
|
||||
#define OPAL_PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
|
||||
#define OPAL_PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
|
||||
#define OPAL_PMIX_RANGE "pmix.range" // (int) opal_pmix_data_range_t value for calls to publish/lookup/unpublish
|
||||
#define OPAL_PMIX_PERSISTENCE "pmix.persist" // (int) opal_pmix_persistence_t value for calls to publish
|
||||
#define OPAL_PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
|
||||
// not request data from the server if not found
|
||||
|
||||
/* attribute used by host server to pass data to the server convenience library - the
|
||||
* data will then be parsed and provided to the local clients */
|
||||
|
@ -39,7 +39,7 @@ static int s1_fence(opal_list_t *procs, int collect_data);
|
||||
static int s1_put(opal_pmix_scope_t scope,
|
||||
opal_value_t *kv);
|
||||
static int s1_get(const opal_process_name_t *id,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_value_t **kv);
|
||||
static int s1_publish(opal_list_t *info);
|
||||
static int s1_lookup(opal_list_t *data, opal_list_t *info);
|
||||
@ -588,7 +588,7 @@ static int s1_fence(opal_list_t *procs, int collect_data)
|
||||
}
|
||||
|
||||
static int s1_get(const opal_process_name_t *id,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_value_t **kv)
|
||||
{
|
||||
int rc;
|
||||
|
@ -46,7 +46,7 @@ static int s2_fence(opal_list_t *procs, int collect_data);
|
||||
static int s2_put(opal_pmix_scope_t scope,
|
||||
opal_value_t *kv);
|
||||
static int s2_get(const opal_process_name_t *id,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_value_t **kv);
|
||||
static int s2_publish(opal_list_t *info);
|
||||
static int s2_lookup(opal_list_t *data, opal_list_t *info);
|
||||
@ -607,7 +607,7 @@ static int s2_fence(opal_list_t *procs, int collect_data)
|
||||
}
|
||||
|
||||
static int s2_get(const opal_process_name_t *id,
|
||||
const char *key,
|
||||
const char *key, opal_list_t *info,
|
||||
opal_value_t **kv)
|
||||
{
|
||||
int rc;
|
||||
|
@ -88,10 +88,9 @@ static int rte_init(void)
|
||||
char *rmluri;
|
||||
opal_value_t *kv;
|
||||
char *val;
|
||||
size_t sz;
|
||||
int u32, *u32ptr;
|
||||
uint16_t u16, *u16ptr;
|
||||
char **peers=NULL, *mycpuset;
|
||||
char **peers=NULL, *mycpuset, **cpusets=NULL;
|
||||
opal_process_name_t name;
|
||||
size_t i;
|
||||
|
||||
@ -153,8 +152,8 @@ static int rte_init(void)
|
||||
|
||||
|
||||
/* get our app number from PMI - ok if not found */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_APPNUM,
|
||||
ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
|
||||
ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
|
||||
if (OPAL_SUCCESS == ret) {
|
||||
orte_process_info.app_num = u32;
|
||||
} else {
|
||||
@ -190,8 +189,8 @@ static int rte_init(void)
|
||||
}
|
||||
|
||||
/* retrieve our topology */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
|
||||
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
|
||||
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
/* load the topology */
|
||||
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
|
||||
@ -259,84 +258,77 @@ static int rte_init(void)
|
||||
|
||||
/* get our local peers */
|
||||
if (0 < orte_process_info.num_local_peers) {
|
||||
/* if my local rank if too high, then that's an error */
|
||||
if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) {
|
||||
ret = ORTE_ERR_BAD_PARAM;
|
||||
error = "num local peers";
|
||||
goto error;
|
||||
}
|
||||
/* retrieve the local peers */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
|
||||
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
peers = opal_argv_split(val, ',');
|
||||
free(val);
|
||||
/* and their cpusets, if available */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
cpusets = opal_argv_split(val, ':');
|
||||
free(val);
|
||||
} else {
|
||||
cpusets = NULL;
|
||||
}
|
||||
} else {
|
||||
peers = NULL;
|
||||
cpusets = NULL;
|
||||
}
|
||||
} else {
|
||||
peers = NULL;
|
||||
}
|
||||
|
||||
/* get our cpuset */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_CPUSET, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS != ret || NULL == val) {
|
||||
/* if we don't have a cpuset, or it is NULL, then we declare our local
|
||||
* peers to be on the same node and everyone else to be non-local */
|
||||
mycpuset = NULL;
|
||||
} else {
|
||||
mycpuset = val;
|
||||
cpusets = NULL;
|
||||
}
|
||||
|
||||
/* set the locality */
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for (sz=0; sz < orte_process_info.num_procs; sz++) {
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
||||
kv->type = OPAL_UINT16;
|
||||
name.vpid = sz;
|
||||
if (sz == ORTE_PROC_MY_NAME->vpid) {
|
||||
/* we are fully local to ourselves */
|
||||
u16 = OPAL_PROC_ALL_LOCAL;
|
||||
} else if (NULL == peers) {
|
||||
/* nobody is local to us */
|
||||
u16 = OPAL_PROC_NON_LOCAL;
|
||||
if (NULL != peers) {
|
||||
/* indentify our cpuset */
|
||||
if (NULL != cpusets) {
|
||||
mycpuset = cpusets[orte_process_info.my_local_rank];
|
||||
} else {
|
||||
for (i=0; NULL != peers[i]; i++) {
|
||||
if (sz == strtoul(peers[i], NULL, 10)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == peers[i]) {
|
||||
/* not a local peer */
|
||||
u16 = OPAL_PROC_NON_LOCAL;
|
||||
} else if (NULL == mycpuset) {
|
||||
mycpuset = NULL;
|
||||
}
|
||||
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for (i=0; NULL != peers[i]; i++) {
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
||||
kv->type = OPAL_UINT16;
|
||||
name.vpid = strtoul(peers[i], NULL, 10);
|
||||
if (name.vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
/* we are fully local to ourselves */
|
||||
u16 = OPAL_PROC_ALL_LOCAL;
|
||||
} else if (NULL == mycpuset || NULL == cpusets[i] ||
|
||||
0 == strcmp(cpusets[i], "UNBOUND")) {
|
||||
/* all we can say is that it shares our node */
|
||||
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||
} else {
|
||||
/* attempt to get their cpuset */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_CPUSET, &name, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
/* we have it, so compute the locality */
|
||||
u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
|
||||
mycpuset, val);
|
||||
free(val);
|
||||
} else {
|
||||
/* all we can say is that it shares our node */
|
||||
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||
}
|
||||
/* we have it, so compute the locality */
|
||||
u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]);
|
||||
}
|
||||
}
|
||||
kv->data.uint16 = u16;
|
||||
ret = opal_pmix.store_local(&name, kv);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
error = "local store of locality";
|
||||
if (NULL != mycpuset) {
|
||||
free(mycpuset);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
"%s ess:pmi:locality: proc %s locality %x",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name), u16));
|
||||
kv->data.uint16 = u16;
|
||||
ret = opal_pmix.store_local(&name, kv);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
error = "local store of locality";
|
||||
opal_argv_free(peers);
|
||||
opal_argv_free(cpusets);
|
||||
goto error;
|
||||
}
|
||||
opal_argv_free(peers);
|
||||
goto error;
|
||||
OBJ_RELEASE(kv);
|
||||
}
|
||||
OBJ_RELEASE(kv);
|
||||
opal_argv_free(peers);
|
||||
opal_argv_free(cpusets);
|
||||
}
|
||||
if (NULL != mycpuset){
|
||||
free(mycpuset);
|
||||
}
|
||||
opal_argv_free(peers);
|
||||
|
||||
/* now that we have all required info, complete the setup */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) {
|
||||
|
@ -40,7 +40,7 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
bool msg_sent;
|
||||
mca_oob_base_component_t *component;
|
||||
bool reachable;
|
||||
opal_value_t *kv;
|
||||
char *uri;
|
||||
|
||||
/* done with this. release it now */
|
||||
OBJ_RELEASE(cd);
|
||||
@ -61,11 +61,14 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(&msg->dst));
|
||||
/* for direct launched procs, the URI might be in the database,
|
||||
* so check there next - if it is, the peer object will be added
|
||||
* to our hash table
|
||||
* to our hash table. However, we don't want to chase up to the
|
||||
* server after it, so indicate it is optional
|
||||
*/
|
||||
if (OPAL_SUCCESS == opal_pmix.get(&msg->dst, OPAL_PMIX_PROC_URI, &kv)) {
|
||||
if (NULL != kv) {
|
||||
process_uri(kv->data.string);
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_PROC_URI, &msg->dst,
|
||||
(char**)&uri, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == rc ) {
|
||||
if (NULL != uri) {
|
||||
process_uri(uri);
|
||||
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
|
||||
ui64, (void**)&pr) ||
|
||||
NULL == pr) {
|
||||
|
@ -481,6 +481,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
|
||||
/* ask our local pmix server for the data */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_dmodex_request(&idreq, modex_resp, req))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
|
||||
OBJ_RELEASE(req);
|
||||
send_error(rc, &idreq, sender);
|
||||
|
@ -236,6 +236,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
/* construct the list of local peers, while adding
|
||||
* each proc's locality info */
|
||||
list = NULL;
|
||||
procs = NULL;
|
||||
vpid = ORTE_VPID_MAX;
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
@ -249,13 +250,15 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
/* note that we have to pass the cpuset for each local
|
||||
* peer so locality can be computed */
|
||||
tmp = NULL;
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_CPUSET);
|
||||
kv->type = OPAL_STRING;
|
||||
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
|
||||
kv->data.string = tmp;
|
||||
if (NULL != tmp) {
|
||||
opal_argv_append_nosize(&procs, tmp);
|
||||
} else {
|
||||
opal_argv_append_nosize(&procs, "UNBOUND");
|
||||
}
|
||||
} else {
|
||||
opal_argv_append_nosize(&procs, "UNBOUND");
|
||||
}
|
||||
opal_list_append(info, &kv->super);
|
||||
/* go ahead and register this client */
|
||||
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
|
||||
(void*)pptr, NULL, NULL))) {
|
||||
@ -276,6 +279,18 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.string = tmp;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
/* construct the list of cpusets for transmission */
|
||||
if (NULL != procs) {
|
||||
tmp = opal_argv_join(procs, ':');
|
||||
opal_argv_free(procs);
|
||||
procs = NULL;
|
||||
/* pass the list of cpusets */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = tmp;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
/* pass the local ldr */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCALLDR);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user