From f6948c2bb475fcc90f581b2bd3edacb4a9b71d4b Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 4 Sep 2015 08:29:09 -0700 Subject: [PATCH] Sync with PMIx master 43e45c3. Get multi-node publish/lookup/unpublish working --- ompi/dpm/dpm.c | 34 +- .../pessimist/vprotocol_pessimist_eventlog.c | 2 +- ompi/mpi/c/lookup_name.c | 28 +- ompi/mpi/c/publish_name.c | 71 ++-- ompi/mpi/c/unpublish_name.c | 28 +- opal/mca/pmix/pmix.h | 18 +- opal/mca/pmix/pmix1xx/pmix/VERSION | 4 +- opal/mca/pmix/pmix1xx/pmix/examples/pub.c | 6 +- opal/mca/pmix/pmix1xx/pmix/examples/server.c | 20 +- opal/mca/pmix/pmix1xx/pmix/include/pmix.h | 182 +++++------ .../pmix/include/pmix/pmix_common.h.in | 3 + .../pmix/pmix1xx/pmix/include/pmix_server.h | 51 ++- opal/mca/pmix/pmix1xx/pmix/src/client/pmi1.c | 8 +- opal/mca/pmix/pmix1xx/pmix/src/client/pmi2.c | 6 +- .../pmix1xx/pmix/src/client/pmix_client.c | 24 +- .../pmix1xx/pmix/src/client/pmix_client_pub.c | 95 +++--- .../pmix1xx/pmix/src/server/pmix_server.c | 13 +- .../pmix1xx/pmix/src/server/pmix_server_ops.c | 95 +++--- .../pmix/pmix1xx/pmix/test/server_callbacks.c | 27 +- .../pmix/pmix1xx/pmix/test/server_callbacks.h | 9 +- .../pmix1xx/pmix/test/simple/simpclient.c | 10 +- .../pmix/pmix1xx/pmix/test/simple/simppub.c | 10 +- .../pmix/pmix1xx/pmix/test/simple/simptest.c | 114 ++++--- opal/mca/pmix/pmix1xx/pmix/test/test_fence.c | 13 +- .../mca/pmix/pmix1xx/pmix/test/test_publish.c | 12 +- opal/mca/pmix/pmix1xx/pmix1.h | 23 +- opal/mca/pmix/pmix1xx/pmix1_client.c | 302 ++++++++---------- opal/mca/pmix/pmix1xx/pmix1_server_north.c | 105 +----- opal/mca/pmix/pmix1xx/pmix1_server_south.c | 41 +++ opal/mca/pmix/pmix1xx/pmix_pmix1.c | 31 +- opal/mca/pmix/pmix_server.h | 12 +- opal/mca/pmix/pmix_types.h | 22 +- orte/mca/ess/base/ess_base_std_orted.c | 10 +- orte/mca/ess/hnp/ess_hnp_module.c | 14 +- orte/mca/schizo/base/base.h | 1 - orte/mca/schizo/base/schizo_base_stubs.c | 3 +- orte/mca/schizo/ompi/schizo_ompi.c | 7 - orte/mca/schizo/schizo.h | 1 - 
orte/orted/pmix/pmix_server.c | 124 ++++++- orte/orted/pmix/pmix_server_internal.h | 15 +- orte/orted/pmix/pmix_server_pub.c | 138 ++++---- orte/runtime/orte_data_server.c | 203 ++++++++---- orte/tools/orte-submit/orte-submit.c | 2 +- orte/tools/orterun/orterun.c | 181 +---------- orte/tools/orterun/orterun.h | 3 - 45 files changed, 1041 insertions(+), 1080 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index abded3c592..7fa7be42aa 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -73,17 +73,21 @@ static OBJ_CLASS_INSTANCE(ompi_dpm_proct_caddy_t, NULL, NULL); struct lookup_caddy_t { - bool active; + volatile bool active; + int status; opal_pmix_pdata_t *pdat; }; static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) { struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; - opal_pmix_pdata_t *p = (opal_pmix_pdata_t*)opal_list_get_first(data); - if (NULL != p && OPAL_STRING == p->value.type && - NULL != p->value.data.string) { - cd->pdat->value.data.string = strdup(p->value.data.string); + cd->status = status; + if (OPAL_SUCCESS == status && NULL != data) { + opal_pmix_pdata_t *p = (opal_pmix_pdata_t*)opal_list_get_first(data); + if (NULL != p && OPAL_STRING == p->value.type && + NULL != p->value.data.string) { + cd->pdat->value.data.string = strdup(p->value.data.string); + } } cd->active = false; } @@ -197,17 +201,13 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, if (send_first) { (void)asprintf(&info->key, "%s:connect", port_string); - info->type = OPAL_STRING; - info->data.string = opal_argv_join(members, ':'); } else { (void)asprintf(&info->key, "%s:accept", port_string); - info->type = OPAL_STRING; - info->data.string = opal_argv_join(members, ':'); } + info->type = OPAL_STRING; + info->data.string = opal_argv_join(members, ':'); /* publish it with "session" scope */ - rc = opal_pmix.publish(OPAL_PMIX_SESSION, - OPAL_PMIX_PERSIST_APP, - &ilist); + rc = opal_pmix.publish(&ilist); OPAL_LIST_DESTRUCT(&ilist); 
if (OPAL_SUCCESS != rc) { opal_argv_free(members); @@ -228,7 +228,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, } opal_list_append(&ilist, &pdat->super); if (NULL == opal_pmix.lookup_nb) { - rc = opal_pmix.lookup(OPAL_PMIX_SESSION, &ilist); + rc = opal_pmix.lookup(&ilist, NULL); if (OPAL_SUCCESS != rc) { OPAL_LIST_DESTRUCT(&ilist); opal_argv_free(members); @@ -242,8 +242,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, opal_argv_append_nosize(&keys, pdat->value.key); caddy.active = true; caddy.pdat = pdat; - rc = opal_pmix.lookup_nb(OPAL_PMIX_SESSION, true, keys, - lookup_cbfunc, &caddy); + rc = opal_pmix.lookup_nb(keys, NULL, lookup_cbfunc, &caddy); if (OPAL_SUCCESS != rc) { OPAL_LIST_DESTRUCT(&ilist); opal_argv_free(keys); @@ -252,6 +251,11 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, } OMPI_WAIT_FOR_COMPLETION(caddy.active); opal_argv_free(keys); + if (OPAL_SUCCESS != caddy.status) { + OPAL_LIST_DESTRUCT(&ilist); + opal_argv_free(members); + return OMPI_ERROR; + } } /* initiate a list of participants for the connect, * starting with our own members, remembering to diff --git a/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c b/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c index 054ce80119..2892c4da11 100644 --- a/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c +++ b/ompi/mca/vprotocol/pessimist/vprotocol_pessimist_eventlog.c @@ -30,7 +30,7 @@ int vprotocol_pessimist_event_logger_connect(int el_rank, ompi_communicator_t ** asprintf(&pdat->value.key, VPROTOCOL_EVENT_LOGGER_NAME_FMT, el_rank); opal_list_append(&results, &pdat->super); - rc = opal_pmix.lookup(OPAL_PMIX_NAMESPACE, &results); + rc = opal_pmix.lookup(&results, NULL); if (OPAL_SUCCESS != rc || OPAL_STRING != pdat->value.type || NULL == pdat->value.data.string) { diff --git a/ompi/mpi/c/lookup_name.c b/ompi/mpi/c/lookup_name.c index be4838b242..d51a357783 100644 --- a/ompi/mpi/c/lookup_name.c +++ 
b/ompi/mpi/c/lookup_name.c @@ -46,9 +46,8 @@ int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name) { char range[OPAL_MAX_INFO_VAL]; int flag=0, ret; - opal_pmix_data_range_t rng; - bool range_given = false; - opal_list_t results; + opal_value_t *rng; + opal_list_t results, pinfo; opal_pmix_pdata_t *pdat; if ( MPI_PARAM_CHECK ) { @@ -70,27 +69,33 @@ int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name) OPAL_CR_ENTER_LIBRARY(); + OBJ_CONSTRUCT(&pinfo, opal_list_t); + /* OMPI supports info keys to pass the range to * be searched for the given key */ if (MPI_INFO_NULL != info) { ompi_info_get (info, "range", sizeof(range) - 1, range, &flag); if (flag) { - range_given = true; if (0 == strcmp(range, "nspace")) { - rng = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_RANGE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace + opal_list_append(&pinfo, &rng->super); } else if (0 == strcmp(range, "session")) { - rng = OPAL_PMIX_SESSION; // share only with procs in same session + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_RANGE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session + opal_list_append(&pinfo, &rng->super); } else { /* unrecognized scope */ + OPAL_LIST_DESTRUCT(&pinfo); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, FUNC_NAME); } } } - if (!range_given) { - /* default to nspace */ - rng = OPAL_PMIX_NAMESPACE; - } /* collect the findings */ OBJ_CONSTRUCT(&results, opal_list_t); @@ -98,7 +103,8 @@ int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name) pdat->value.key = strdup(service_name); opal_list_append(&results, &pdat->super); - ret = opal_pmix.lookup(rng, &results); + ret = opal_pmix.lookup(&results, &pinfo); + OPAL_LIST_DESTRUCT(&pinfo); if (OPAL_SUCCESS != ret || OPAL_STRING 
!= pdat->value.type || NULL == pdat->value.data.string) { diff --git a/ompi/mpi/c/publish_name.c b/ompi/mpi/c/publish_name.c index b9fee343b2..30dbe5599d 100644 --- a/ompi/mpi/c/publish_name.c +++ b/ompi/mpi/c/publish_name.c @@ -48,12 +48,8 @@ int MPI_Publish_name(const char *service_name, MPI_Info info, int rc; char range[OPAL_MAX_INFO_VAL]; int flag=0; - opal_pmix_data_range_t rng; - bool range_given = false; - opal_pmix_persistence_t persist; - bool persistence_given = false; + opal_value_t *rng; opal_list_t values; - opal_value_t *pinfo; if ( MPI_PARAM_CHECK ) { OMPI_ERR_INIT_FINALIZE(FUNC_NAME); @@ -73,58 +69,75 @@ int MPI_Publish_name(const char *service_name, MPI_Info info, } OPAL_CR_ENTER_LIBRARY(); + OBJ_CONSTRUCT(&values, opal_list_t); /* OMPI supports info keys to pass the range and persistence to * be used for the given key */ if (MPI_INFO_NULL != info) { ompi_info_get (info, "range", sizeof(range) - 1, range, &flag); if (flag) { - range_given = true; if (0 == strcmp(range, "nspace")) { - rng = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_RANGE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace + opal_list_append(&values, &rng->super); } else if (0 == strcmp(range, "session")) { - rng = OPAL_PMIX_SESSION; // share only with procs in same session + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_RANGE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session + opal_list_append(&values, &rng->super); } else { - /* unrecognized range */ + /* unrecognized scope */ + OPAL_LIST_DESTRUCT(&values); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, FUNC_NAME); } } ompi_info_get (info, "persistence", sizeof(range) - 1, range, &flag); if (flag) { - persistence_given = true; if (0 == strcmp(range, "indef")) { - persist = OPAL_PMIX_PERSIST_INDEF; // retain until 
specifically deleted + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_PERSISTENCE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_PERSIST_INDEF; // retain until specifically deleted + opal_list_append(&values, &rng->super); } else if (0 == strcmp(range, "proc")) { - persist = OPAL_PMIX_PERSIST_PROC; // retain until publishing process terminates + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_PERSISTENCE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_PERSIST_PROC; // retain until publishing process terminates + opal_list_append(&values, &rng->super); } else if (0 == strcmp(range, "app")) { - persist = OPAL_PMIX_PERSIST_APP; // retain until application terminates + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_PERSISTENCE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_PERSIST_APP; // retain until application terminates + opal_list_append(&values, &rng->super); } else if (0 == strcmp(range, "session")) { - persist = OPAL_PMIX_PERSIST_SESSION; // retain until session/allocation terminates + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_PERSISTENCE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_PERSIST_SESSION; // retain until session/allocation terminates + opal_list_append(&values, &rng->super); } else { /* unrecognized persistence */ + OPAL_LIST_DESTRUCT(&values); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, FUNC_NAME); } } } - if (!range_given) { - /* default to nspace */ - rng = OPAL_PMIX_NAMESPACE; - } - if (!persistence_given) { - persist = OPAL_PMIX_PERSIST_APP; - } - /* publish the values */ - OBJ_CONSTRUCT(&values, opal_list_t); - pinfo = OBJ_NEW(opal_value_t); - pinfo->key = strdup(service_name); - pinfo->type = OPAL_STRING; - pinfo->data.string = strdup(port_name); - opal_list_append(&values, &pinfo->super); + /* publish the service name */ + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(service_name); + rng->type = OPAL_STRING; + rng->data.string = 
strdup(port_name); + opal_list_append(&values, &rng->super); - rc = opal_pmix.publish(rng, persist, &values); + rc = opal_pmix.publish(&values); OPAL_LIST_DESTRUCT(&values); OPAL_CR_EXIT_LIBRARY(); diff --git a/ompi/mpi/c/unpublish_name.c b/ompi/mpi/c/unpublish_name.c index 2b60c04744..856fd22293 100644 --- a/ompi/mpi/c/unpublish_name.c +++ b/ompi/mpi/c/unpublish_name.c @@ -49,8 +49,8 @@ int MPI_Unpublish_name(const char *service_name, MPI_Info info, int rc; char range[OPAL_MAX_INFO_VAL]; int flag=0; - opal_pmix_data_range_t rng; - bool range_given = false; + opal_list_t pinfo; + opal_value_t *rng; char **keys = NULL; if ( MPI_PARAM_CHECK ) { @@ -71,34 +71,40 @@ int MPI_Unpublish_name(const char *service_name, MPI_Info info, } OPAL_CR_ENTER_LIBRARY(); + OBJ_CONSTRUCT(&pinfo, opal_list_t); /* OMPI supports info keys to pass the range to * be searched for the given key */ if (MPI_INFO_NULL != info) { ompi_info_get (info, "range", sizeof(range) - 1, range, &flag); if (flag) { - range_given = true; if (0 == strcmp(range, "nspace")) { - rng = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_RANGE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace + opal_list_append(&pinfo, &rng->super); } else if (0 == strcmp(range, "session")) { - rng = OPAL_PMIX_SESSION; // share only with procs in same session + rng = OBJ_NEW(opal_value_t); + rng->key = strdup(OPAL_PMIX_RANGE); + rng->type = OPAL_INT; + rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session + opal_list_append(&pinfo, &rng->super); } else { - /* unrecognized range */ + /* unrecognized scope */ + OPAL_LIST_DESTRUCT(&pinfo); return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, FUNC_NAME); } } } - if (!range_given) { - /* default to nspace */ - rng = OPAL_PMIX_NAMESPACE; - } /* unpublish the service_name */ opal_argv_append_nosize(&keys, service_name); - rc = 
opal_pmix.unpublish(rng, keys); + rc = opal_pmix.unpublish(keys, &pinfo); opal_argv_free(keys); + OPAL_LIST_DESTRUCT(&pinfo); if ( OPAL_SUCCESS != rc ) { if (OPAL_ERR_NOT_FOUND == rc) { diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index b525ad2529..46b207055a 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -325,12 +325,8 @@ typedef int (*opal_pmix_base_module_get_nb_fn_t)(const opal_process_name_t *proc * data has been posted and is available. The non-blocking form will * return immediately, executing the callback when the server confirms * availability of the data */ -typedef int (*opal_pmix_base_module_publish_fn_t)(opal_pmix_data_range_t scope, - opal_pmix_persistence_t persist, - opal_list_t *info); -typedef int (*opal_pmix_base_module_publish_nb_fn_t)(opal_pmix_data_range_t scope, - opal_pmix_persistence_t persist, - opal_list_t *info, +typedef int (*opal_pmix_base_module_publish_fn_t)(opal_list_t *info); +typedef int (*opal_pmix_base_module_publish_nb_fn_t)(opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); /* Lookup information published by another process within the @@ -352,8 +348,8 @@ typedef int (*opal_pmix_base_module_publish_nb_fn_t)(opal_pmix_data_range_t scop * and return any found items. Thus, the caller is responsible for * ensuring that data is published prior to executing a lookup, or * for retrying until the requested data is found */ -typedef int (*opal_pmix_base_module_lookup_fn_t)(opal_pmix_data_range_t scope, - opal_list_t *data); +typedef int (*opal_pmix_base_module_lookup_fn_t)(opal_list_t *data, + opal_list_t *info); /* Non-blocking form of the _PMIx_Lookup_ function. 
Data for * the provided NULL-terminated keys array will be returned @@ -362,7 +358,7 @@ typedef int (*opal_pmix_base_module_lookup_fn_t)(opal_pmix_data_range_t scope, * wait for _all_ requested data before executing the callback * (_true_), or to callback once the server returns whatever * data is immediately available (_false_) */ -typedef int (*opal_pmix_base_module_lookup_nb_fn_t)(opal_pmix_data_range_t scope, int wait, char **keys, +typedef int (*opal_pmix_base_module_lookup_nb_fn_t)(char **keys, opal_list_t *info, opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata); /* Unpublish data posted by this process using the given keys @@ -370,14 +366,14 @@ typedef int (*opal_pmix_base_module_lookup_nb_fn_t)(opal_pmix_data_range_t scope * the data has been removed by the server. A value of _NULL_ * for the keys parameter instructs the server to remove * _all_ data published by this process within the given scope */ -typedef int (*opal_pmix_base_module_unpublish_fn_t)(opal_pmix_data_range_t scope, char **keys); +typedef int (*opal_pmix_base_module_unpublish_fn_t)(char **keys, opal_list_t *info); /* Non-blocking form of the _PMIx_Unpublish_ function. The * callback function will be executed once the server confirms * removal of the specified data. A value of _NULL_ * for the keys parameter instructs the server to remove * _all_ data published by this process within the given scope */ -typedef int (*opal_pmix_base_module_unpublish_nb_fn_t)(opal_pmix_data_range_t scope, char **keys, +typedef int (*opal_pmix_base_module_unpublish_nb_fn_t)(char **keys, opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); /* Spawn a new job. 
The spawned applications are automatically diff --git a/opal/mca/pmix/pmix1xx/pmix/VERSION b/opal/mca/pmix/pmix1xx/pmix/VERSION index ea4a1fcfb9..8c8c2f0550 100644 --- a/opal/mca/pmix/pmix1xx/pmix/VERSION +++ b/opal/mca/pmix/pmix1xx/pmix/VERSION @@ -30,7 +30,7 @@ greek=a1 # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git51479b0 +repo_rev=git6afbc98 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Sep 01, 2015" +date="Sep 04, 2015" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix1xx/pmix/examples/pub.c b/opal/mca/pmix/pmix1xx/pmix/examples/pub.c index ab9517c276..e369eb3e09 100644 --- a/opal/mca/pmix/pmix1xx/pmix/examples/pub.c +++ b/opal/mca/pmix/pmix1xx/pmix/examples/pub.c @@ -77,7 +77,7 @@ int main(int argc, char **argv) (void)strncpy(info[1].key, "PANDA", PMIX_MAX_KEYLEN); info[1].value.type = PMIX_SIZE; info[1].value.data.size = 123456; - if (PMIX_SUCCESS != (rc = PMIx_Publish(PMIX_NAMESPACE, PMIX_PERSIST_APP, info, 2))) { + if (PMIX_SUCCESS != (rc = PMIx_Publish(info, 2))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Publish failed: %d\n", myproc.nspace, myproc.rank, rc); goto done; } @@ -95,7 +95,7 @@ int main(int argc, char **argv) if (0 != myproc.rank) { PMIX_PDATA_CREATE(pdata, 1); (void)strncpy(pdata[0].key, "FOOBAR", PMIX_MAX_KEYLEN); - if (PMIX_SUCCESS != (rc = PMIx_Lookup(PMIX_NAMESPACE, NULL, 0, pdata, 1))) { + if (PMIX_SUCCESS != (rc = PMIx_Lookup(pdata, 1, NULL, 0))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Lookup failed: %d\n", myproc.nspace, myproc.rank, rc); goto done; } @@ -137,7 +137,7 @@ int main(int argc, char **argv) keys[1] = "PANDA"; keys[2] = NULL; - if (PMIX_SUCCESS != (rc = PMIx_Unpublish(PMIX_NAMESPACE, 
keys))) { + if (PMIX_SUCCESS != (rc = PMIx_Unpublish(keys, NULL, 0))) { fprintf(stderr, "Client ns %s rank %d: PMIx_Unpublish failed: %d\n", myproc.nspace, myproc.rank, rc); free(keys); goto done; diff --git a/opal/mca/pmix/pmix1xx/pmix/examples/server.c b/opal/mca/pmix/pmix1xx/pmix/examples/server.c index 35e2c182d5..388384dd82 100644 --- a/opal/mca/pmix/pmix1xx/pmix/examples/server.c +++ b/opal/mca/pmix/pmix1xx/pmix/examples/server.c @@ -52,15 +52,13 @@ static int dmodex_fn(const pmix_proc_t *proc, const pmix_info_t info[], size_t ninfo, pmix_modex_cbfunc_t cbfunc, void *cbdata); static int publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -static int lookup_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static int lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata); -static int unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, char **keys, +static int unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); static int spawn_fn(const pmix_proc_t *proc, const pmix_info_t job_info[], size_t ninfo, @@ -443,7 +441,6 @@ static int dmodex_fn(const pmix_proc_t *proc, static int publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { @@ -467,9 +464,8 @@ static int publish_fn(const pmix_proc_t *proc, } -static int lookup_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static int lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata) { pmix_locdat_t *p, *p2; @@ -517,8 
+513,8 @@ static int lookup_fn(const pmix_proc_t *proc, } -static int unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, char **keys, +static int unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_locdat_t *p, *p2; diff --git a/opal/mca/pmix/pmix1xx/pmix/include/pmix.h b/opal/mca/pmix/pmix1xx/pmix/include/pmix.h index e5a18cdd77..26e4d190ae 100644 --- a/opal/mca/pmix/pmix1xx/pmix/include/pmix.h +++ b/opal/mca/pmix/pmix1xx/pmix/include/pmix.h @@ -65,27 +65,24 @@ BEGIN_C_DECLS /**** PMIX API ****/ -/* NOTE: calls to these APIs must be thread-protected as there - * currently is NO internal thread safety. */ - -/* Initialize the PMIx client, returning the namespace assigned - * to this client's application in the provided character array - * (must be of size PMIX_MAX_NSLEN or greater). Passing a parameter - * of _NULL_ for either or both parameters is allowed if the user +/* Initialize the PMIx client, returning the process identifier assigned + * to this client's application in the provided pmix_proc_t struct. + * Passing a parameter of _NULL_ for this parameter is allowed if the user * wishes solely to initialize the PMIx system and does not require - * return of the NULL parameter(s) at that time. + * return of the identifier at that time. * * When called the PMIx client will check for the required connection * information of the local PMIx server and will establish the connection. * If the information is not found, or the server connection fails, then * an appropriate error constant will be returned. * - * If successful, the function will return PMIX_SUCCESS, will fill the - * provided namespace array with the server-assigned namespace, and return - * the rank of the process within the application. Note that the PMIx - * client library is referenced counted, and so multiple calls to PMIx_Init - * are allowed. 
Thus, one way to obtain the namespace and rank of the - * process is to simply call PMIx_Init with non-NULL parameters. */ + * If successful, the function will return PMIX_SUCCESS and will fill the + * provided structure with the server-assigned namespace and rank of the + * process within the application. + * + * Note that the PMIx client library is reference counted, and so multiple + * calls to PMIx_Init are allowed. Thus, one way to obtain the namespace and + * rank of the process is to simply call PMIx_Init with a non-NULL parameter. */ pmix_status_t PMIx_Init(pmix_proc_t *proc); /* Finalize the PMIx client, closing the connection to the local server. @@ -116,12 +113,18 @@ int PMIx_Initialized(void); * Passing a _NULL_ msg parameter is allowed. Note that race conditions * caused by multiple processes calling PMIx_Abort are left to the * server implementation to resolve with regard to which status is - * returned and what messages (if any) are printed. - */ + * returned and what messages (if any) are printed. */ pmix_status_t PMIx_Abort(int status, const char msg[], pmix_proc_t procs[], size_t nprocs); +/* Push a value into the client's namespace. The client library will cache + * the information locally until _PMIx_Commit_ is called. The provided scope + * value is passed to the local PMIx server, which will distribute the data + * as directed. */ +pmix_status_t PMIx_Put(pmix_scope_t scope, const char key[], pmix_value_t *val); + + /* Push all previously _PMIx_Put_ values to the local PMIx server. * This is an asynchronous operation - the library will immediately * return to the caller while the data is transmitted to the local @@ -132,7 +135,7 @@ pmix_status_t PMIx_Commit(void); /* Execute a blocking barrier across the processes identified in the * specified array. Passing a _NULL_ pointer as the _procs_ parameter * indicates that the barrier is to span all processes in the client's - * namespace.
Each provided proc struct can pass PMIX_RANK_WILDCARD to + * namespace. Each provided pmix_proc_t struct can pass PMIX_RANK_WILDCARD to * indicate that all processes in the given namespace are * participating. * @@ -144,19 +147,22 @@ pmix_status_t PMIx_Commit(void); * A value of _false_ indicates that the callback is just used as a release * and no data is to be returned at that time. A value of _true_ indicates * that all _put_ data is to be collected by the barrier. Returned data is - * locally cached so that subsequent calls to _PMIx_Get_ can be serviced - * without communicating to/from the server, but at the cost of increased - * memory footprint + * cached at the server to reduce memory footprint, and can be retrieved + * as needed by calls to PMIx_Get(nb). + * + * Note that for scalability reasons, the default behavior for PMIx_Fence + * is to _not_ collect the data. * * (b) PMIX_COLLECTIVE_ALGO - a comma-delimited string indicating the algos - * to be used for executing the barrier, in priority order. The _mandatory_ - * flag can instruct the host RM that it should return an error if none - * of the provided algos are available. Otherwise, the RM is to use one - * of the algos if possible, but is otherwise free to use any of its - * available methods to execute the operation. + * to be used for executing the barrier, in priority order. * - * (c) PMIX_TIMEOUT - maximum time for the fence to execute before declaring - * an error. The RM shall terminate the operation and notify participants + * (c) PMIX_COLLECTIVE_ALGO_REQD - instructs the host RM that it should return + * an error if none of the specified algos are available. Otherwise, the RM + * is to use one of the algos if possible, but is otherwise free to use any + * of its available methods to execute the operation. + * + * (d) PMIX_TIMEOUT - maximum time for the fence to execute before declaring + * an error. 
By default, the RM shall terminate the operation and notify participants * if one or more of the indicated procs fails during the fence. However, * the timeout parameter can help avoid "hangs" due to programming errors * that prevent one or more procs from reaching the "fence". @@ -164,7 +170,6 @@ pmix_status_t PMIx_Commit(void); pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo); -/* Fence_nb */ /* Non-blocking version of PMIx_Fence. Note that the function will return * an error if a _NULL_ callback function is given. */ pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs, @@ -172,16 +177,9 @@ pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs, pmix_op_cbfunc_t cbfunc, void *cbdata); -/* Push a value into the client's namespace. The client library will cache - * the information locally until _PMIx_Commit_ is called. The provided scope - * value is passed to the local PMIx server, which will distribute the data - * as directed. */ -pmix_status_t PMIx_Put(pmix_scope_t scope, const char key[], pmix_value_t *val); - -/* Retrieve information for the specified _key_ as published by the given _rank_ - * within the provided _namespace_, returning a pointer to the value in the - * given address. A _NULL_ value for the namespace indicates that the rank - * is within the caller's namespace. +/* Retrieve information for the specified _key_ as published by the process + * identified in the given pmix_proc_t, returning a pointer to the value in the + * given address. * * This is a blocking operation - the caller will block until * the specified data has been _PMIx_Put_ by the specified rank. The caller is @@ -199,55 +197,48 @@ pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[], const pmix_info_t info[], size_t ninfo, pmix_value_t **val); -/* Retrieve information for the specified _key_ as _PMIx_Put_ by the given _rank_ - * within the provided _namespace_. 
This is a non-blocking operation - the - * callback function will be executed once the specified data has been _PMIx_Put_ - * by the specified rank and retrieved by the local server. The info +/* A non-blocking operation version of PMIx_Get - the callback function will + * be executed once the specified data has been _PMIx_Put_ + * by the identified process and retrieved by the local server. The info * array is used as described above for the blocking form of this call. */ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char key[], const pmix_info_t info[], size_t ninfo, pmix_value_cbfunc_t cbfunc, void *cbdata); -/* Publish the data in the info array for lookup subject to the provided - * data range. Note that the keys must be unique within the specified +/* Publish the data in the info array for lookup. By default, + * the data will be published into the PMIX_SESSION range and + * with PMIX_PERSIST_APP persistence. Changes to those values, + * and any additional directives, can be included in the pmix_info_t + * array. + * + * Note that the keys must be unique within the specified * data range or else an error will be returned (first published * wins). Attempts to access the data by procs outside of * the provided data range will be rejected. * - * Note: Some host environments may support user/group level - * access controls on the information in addition to the data range. - * These can be specified in the info array using the appropriately - * defined keys. - * * The persistence parameter instructs the server as to how long - * the data is to be retained, within the context of the range. - * For example, data published within _PMIX_NAMESPACE_ will be - * deleted along with the namespace regardless of the persistence. - * However, data published within PMIX_USER would be retained if - * the persistence was set to _PMIX_PERSIST_SESSION_ until the - * allocation terminates. + * the data is to be retained. 
* * The blocking form will block until the server confirms that the * data has been posted and is available. The non-blocking form will * return immediately, executing the callback when the server confirms * availability of the data. */ -pmix_status_t PMIx_Publish(pmix_data_range_t range, - pmix_persistence_t persist, - const pmix_info_t info[], - size_t ninfo); -pmix_status_t PMIx_Publish_nb(pmix_data_range_t range, - pmix_persistence_t persist, - const pmix_info_t info[], - size_t ninfo, +pmix_status_t PMIx_Publish(const pmix_info_t info[], size_t ninfo); +pmix_status_t PMIx_Publish_nb(const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -/* Lookup information published by another process within the - * specified range. A rabge of _PMIX_DATA_RANGE_UNDEF_ requests that - * the search be conducted across _all_ namespaces accessible by this - * user. +/* Lookup information published by this or another process. By default, + * the search will be conducted across the PMIX_SESSION range. Changes + * to the range, and any additional directives, can be provided + * in the pmix_info_t array. Note that the search is also constrained + * to only data published by the current user ID - i.e., the search + * will not return data published by an application being executed + * by another user. There currently is no option to override this + * behavior - such an option may become available later via an + * appropriate pmix_info_t directive. * * The "data" parameter consists of an array of pmix_pdata_t struct with the * keys specifying the requested information. Data will be returned @@ -270,18 +261,13 @@ pmix_status_t PMIx_Publish_nb(pmix_data_range_t range, * by including: * * (a) PMIX_WAIT - wait for the requested data to be published. The - * _mandatory_ flag indicates that the server is to wait until - * all data has become available. Otherwise, the function will - * return as soon as the specified number of values have been - * collected. 
A value of -1 indicates that all values must be - * obtained. + * server is to wait until all data has become available. * * (b) PMIX_TIMEOUT - max time to wait for data to become available. * */ -pmix_status_t PMIx_Lookup(pmix_data_range_t range, - const pmix_info_t info[], size_t ninfo, - pmix_pdata_t data[], size_t ndata); +pmix_status_t PMIx_Lookup(pmix_pdata_t data[], size_t ndata, + const pmix_info_t info[], size_t ninfo); /* Non-blocking form of the _PMIx_Lookup_ function. Data for * the provided NULL-terminated keys array will be returned @@ -289,44 +275,58 @@ pmix_status_t PMIx_Lookup(pmix_data_range_t range, * behavior is to _not_ wait for data to be published. The * info keys can be used to modify the behavior as previously * described */ -pmix_status_t PMIx_Lookup_nb(pmix_data_range_t range, char **keys, - const pmix_info_t info[], size_t ninfo, +pmix_status_t PMIx_Lookup_nb(char **keys, const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata); -/* Unpublish data posted by this process using the given keys - * within the specified data range. The function will block until - * the data has been removed by the server. A value of _NULL_ - * for the keys parameter instructs the server to remove - * _all_ data published by this process within the given range */ -pmix_status_t PMIx_Unpublish(pmix_data_range_t range, char **keys); +/* Unpublish data posted by this process using the given keys. + * The function will block until the data has been removed by + * the server. A value of _NULL_ for the keys parameter instructs + * the server to remove _all_ data published by this process. + * + * By default, the range is assumed to be PMIX_SESSION. Changes + * to the range, and any additional directives, can be provided + * in the pmix_info_t array */ +pmix_status_t PMIx_Unpublish(char **keys, + const pmix_info_t info[], size_t ninfo); /* Non-blocking form of the _PMIx_Unpublish_ function. 
The * callback function will be executed once the server confirms - * removal of the specified data. A value of _NULL_ - * for the keys parameter instructs the server to remove - * _all_ data published by this process within the given range */ -pmix_status_t PMIx_Unpublish_nb(pmix_data_range_t range, char **keys, + * removal of the specified data. */ +pmix_status_t PMIx_Unpublish_nb(char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Spawn a new job. The assigned namespace of the spawned applications * is returned in the nspace parameter - a _NULL_ value in that * location indicates that the caller doesn't wish to have the - * namespace returned. Behavior of individual resource managers + * namespace returned. The nspace array must be at least of size + * PMIX_MAX_NSLEN+1. Behavior of individual resource managers * may differ, but it is expected that failure of any application * process to start will result in termination/cleanup of _all_ * processes in the newly spawned job and return of an error * code to the caller. * + * By default, the spawned processes will be PMIx "connected" to + * the parent process upon successful launch (see PMIx_Connect + * description for details). Note that this only means that the + * parent process (a) will be given a copy of the new job's + * information so it can query job-level info without + * incurring any communication penalties, and (b) will receive + * notification of errors from processes in the child job. + * * Job-level directives can be specified in the job_info array. 
This * can include: * - * (a) PMIX_NON_MPI - the spawned job is not an MPI job and the procs will + * (a) PMIX_NON_PMI - processes in the spawned job will * not be calling PMIx_Init * * (b) PMIX_TIMEOUT - declare the spawn as having failed if the launched * procs do not call PMIx_Init within the specified time + * + * (c) PMIX_NOTIFY_COMPLETION - notify the parent process when the + * child job terminates, either normally or with error */ pmix_status_t PMIx_Spawn(const pmix_info_t job_info[], size_t ninfo, const pmix_app_t apps[], size_t napps, @@ -353,8 +353,8 @@ pmix_status_t PMIx_Spawn_nb(const pmix_info_t job_info[], size_t ninfo, * the job-level info from those nspaces other than their own. * * Note: a process can only engage in _one_ connect operation involving the identical - * set of ranges at a time. However, a process _can_ be simultaneously engaged - * in multiple connect operations, each involving a different set of ranges + * set of processes at a time. However, a process _can_ be simultaneously engaged + * in multiple connect operations, each involving a different set of processes * * As in the case of the fence operation, the info array can be used to pass * user-level directives regarding the algorithm to be used for the collective diff --git a/opal/mca/pmix/pmix1xx/pmix/include/pmix/pmix_common.h.in b/opal/mca/pmix/pmix1xx/pmix/include/pmix/pmix_common.h.in index fa2ece0f4f..7af614c21c 100644 --- a/opal/mca/pmix/pmix1xx/pmix/include/pmix/pmix_common.h.in +++ b/opal/mca/pmix/pmix1xx/pmix/include/pmix/pmix_common.h.in @@ -163,6 +163,9 @@ BEGIN_C_DECLS #define PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until the specified #values are found #define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective #define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory +#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) 
notify parent process upon termination of child job +#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish +#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish /* attributes used by host server to pass data to the server convenience library - the * data will then be parsed and provided to the local clients */ diff --git a/opal/mca/pmix/pmix1xx/pmix/include/pmix_server.h b/opal/mca/pmix/pmix1xx/pmix/include/pmix_server.h index 81da17fafb..8e4e506c84 100644 --- a/opal/mca/pmix/pmix1xx/pmix/include/pmix_server.h +++ b/opal/mca/pmix/pmix1xx/pmix/include/pmix_server.h @@ -103,7 +103,7 @@ BEGIN_C_DECLS typedef int (*pmix_server_client_connected_fn_t)(const pmix_proc_t *proc, void* server_object); -/* Notify the host server that a client called PMIx_Finalize- note +/* Notify the host server that a client called PMIx_Finalize - note * that the client will be in a blocked state until the host server * executes the callback function, thus allowing the PMIx server support * library to release the client */ @@ -158,49 +158,46 @@ typedef pmix_status_t (*pmix_server_dmodex_req_fn_t)(const pmix_proc_t *proc, /* Publish data per the PMIx API specification. The callback is to be executed - * upon completion of the operation. The host server is not required to guarantee - * support for the requested range - i.e., the server does not need to return an - * error if the data store doesn't support range-based isolation. However, the - * server must return an error (a) if the key is duplicative within the storage - * range, and (b) if the server does not allow overwriting of published info by - * the original publisher - it is left to the discretion of the host server to - * allow info-key-based flags to modify this behavior. The persist flag indicates - * how long the server should retain the data. 
The nspace/rank of the publishing - * process is also provided and is expected to be returned on any subsequent - * lookup request */ + * upon completion of the operation. The default data range is expected to be + * PMIX_SESSION, and the default persistence PMIX_PERSIST_SESSION. These values + * can be modified by including the respective pmix_info_t struct in the + * provided array. + * + * Note that the host server is not required to guarantee support for any specific + * range - i.e., the server does not need to return an error if the data store + * doesn't support range-based isolation. However, the server must return an error + * (a) if the key is duplicative within the storage range, and (b) if the server + * does not allow overwriting of published info by the original publisher - it is + * left to the discretion of the host server to allow info-key-based flags to modify + * this behavior. + * + * The persistence indicates how long the server should retain the data. + * + * The identifier of the publishing process is also provided and is expected to + * be returned on any subsequent lookup request */ typedef pmix_status_t (*pmix_server_publish_fn_t)(const pmix_proc_t *proc, - pmix_data_range_t range, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Lookup published data. The host server will be passed a NULL-terminated array - * of string keys along with the range within which the data is expected to have - * been published. The host server is not required to guarantee support for all - * PMIx-defined ranges, but should only search data stores within the specified - * range within the context of the corresponding "publish" API. + * of string keys. * * The array of info structs is used to pass user-requested options to the server. 
* This can include a wait flag to indicate that the server should wait for all * data to become available before executing the callback function, or should * immediately callback with whatever data is available. In addition, a timeout * can be specified on the wait to preclude an indefinite wait for data that - * may never be published. The directives are optional _unless_ the _mandatory_ flag - * has been set - in such cases, the host RM is required to return an error - * if the directive cannot be met. */ -typedef pmix_status_t (*pmix_server_lookup_fn_t)(const pmix_proc_t *proc, - pmix_data_range_t range, + * may never be published. */ +typedef pmix_status_t (*pmix_server_lookup_fn_t)(const pmix_proc_t *proc, char **keys, const pmix_info_t info[], size_t ninfo, - char **keys, pmix_lookup_cbfunc_t cbfunc, void *cbdata); /* Delete data from the data store. The host server will be passed a NULL-terminated array - * of string keys along with the range within which the data is expected to have - * been published. The callback is to be executed upon completion of the delete + * of string keys, plus potential directives such as the data range within which the + * keys should be deleted. The callback is to be executed upon completion of the delete * procedure */ -typedef pmix_status_t (*pmix_server_unpublish_fn_t)(const pmix_proc_t *proc, - pmix_data_range_t range, +typedef pmix_status_t (*pmix_server_unpublish_fn_t)(const pmix_proc_t *proc, char **keys, const pmix_info_t info[], size_t ninfo, - char **keys, pmix_op_cbfunc_t cbfunc, void *cbdata); /* Spawn a set of applications/processes as per the PMIx API. Note that diff --git a/opal/mca/pmix/pmix1xx/pmix/src/client/pmi1.c b/opal/mca/pmix/pmix1xx/pmix/src/client/pmi1.c index 68357164f6..398f13c955 100644 --- a/opal/mca/pmix/pmix1xx/pmix/src/client/pmi1.c +++ b/opal/mca/pmix/pmix1xx/pmix/src/client/pmi1.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014 Intel, Inc. 
All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -190,7 +190,7 @@ int PMI_Publish_name(const char service_name[], const char port[]) /* publish the info - PMI-1 doesn't support * any scope other than inside our own nspace */ - rc = PMIx_Publish(PMIX_NAMESPACE, PMIX_PERSIST_APP, &info, 1); + rc = PMIx_Publish(&info, 1); return convert_err(rc); } @@ -204,7 +204,7 @@ int PMI_Unpublish_name(const char service_name[]) keys[0] = (char*)service_name; keys[1] = NULL; - rc = PMIx_Unpublish(PMIX_NAMESPACE, keys); + rc = PMIx_Unpublish(keys, NULL, 0); return convert_err(rc); } @@ -219,7 +219,7 @@ int PMI_Lookup_name(const char service_name[], char port[]) (void)strncpy(pdata.key, service_name, PMIX_MAX_KEYLEN); /* PMI-1 doesn't want the nspace back */ - if (PMIX_SUCCESS != (rc = PMIx_Lookup(PMIX_NAMESPACE, NULL, 0, &pdata, 1))) { + if (PMIX_SUCCESS != (rc = PMIx_Lookup(&pdata, 1, NULL, 0))) { return convert_err(rc); } diff --git a/opal/mca/pmix/pmix1xx/pmix/src/client/pmi2.c b/opal/mca/pmix/pmix1xx/pmix/src/client/pmi2.c index f8a7adccfc..1ebcc9ae1d 100644 --- a/opal/mca/pmix/pmix1xx/pmix/src/client/pmi2.c +++ b/opal/mca/pmix/pmix1xx/pmix/src/client/pmi2.c @@ -240,7 +240,7 @@ int PMI2_Nameserv_publish(const char service_name[], const PMI_keyval_t *info_pt } /* publish the info - PMI-2 doesn't support * any scope other than inside our own nspace */ - rc = PMIx_Publish(PMIX_NAMESPACE, PMIX_PERSIST_APP, info, nvals); + rc = PMIx_Publish(info, nvals); return convert_err(rc); } @@ -261,7 +261,7 @@ int PMI2_Nameserv_unpublish(const char service_name[], keys[1] = info_ptr->key; } - rc = PMIx_Unpublish(PMIX_NAMESPACE, keys); + rc = PMIx_Unpublish(keys, NULL, 0); return convert_err(rc); } @@ -288,7 +288,7 @@ int PMI2_Nameserv_lookup(const char service_name[], const PMI_keyval_t *info_ptr } /* lookup the info */ - if 
(PMIX_SUCCESS != (rc = PMIx_Lookup(PMIX_NAMESPACE, NULL, 0, pdata, nvals))) { + if (PMIX_SUCCESS != (rc = PMIx_Lookup(pdata, nvals, NULL, 0))) { PMIX_PDATA_DESTRUCT(&pdata[0]); PMIX_PDATA_DESTRUCT(&pdata[1]); return convert_err(rc); diff --git a/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client.c index 29e1f733c5..2c60fbf7b3 100644 --- a/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client.c @@ -149,6 +149,7 @@ static void wait_cbfunc(struct pmix_peer_t *pr, pmix_usock_hdr_t *hdr, cb->active = false; } +/* callback to receive job info */ static void job_data(struct pmix_peer_t *pr, pmix_usock_hdr_t *hdr, pmix_buffer_t *buf, void *cbdata) { @@ -165,6 +166,7 @@ static void job_data(struct pmix_peer_t *pr, pmix_usock_hdr_t *hdr, } /* decode it */ pmix_client_process_nspace_blob(pmix_globals.myid.nspace, buf); + cb->status = PMIX_SUCCESS; cb->active = false; } @@ -247,6 +249,8 @@ int PMIx_Init(pmix_proc_t *proc) /* get our effective id's */ pmix_globals.uid = geteuid(); pmix_globals.gid = getegid(); + /* default to our internal errhandler */ + pmix_globals.errhandler = myerrhandler; /* initialize the output system */ if (!pmix_output_init()) { @@ -323,17 +327,19 @@ int PMIx_Init(pmix_proc_t *proc) return -1; } - /* connect to the server - returns job info if successful */ + /* setup an object to track server connection */ PMIX_CONSTRUCT(&cb, pmix_cb_t); cb.active = true; + /* connect to the server - returns job info if successful */ if (PMIX_SUCCESS != (rc = connect_to_server(&address, &cb))){ PMIX_DESTRUCT(&cb); return rc; } PMIX_WAIT_FOR_COMPLETION(cb.active); + rc = cb.status; PMIX_DESTRUCT(&cb); - return PMIX_SUCCESS; + return rc; } int PMIx_Initialized(void) @@ -769,10 +775,21 @@ static int recv_connect_ack(int sd) { int reply; int rc; + struct timeval tv, save; + pmix_socklen_t sz; pmix_output_verbose(2, pmix_globals.debug_output, "pmix: RECV CONNECT ACK 
FROM SERVER"); + /* get the current timeout value so we can reset to it */ + sz = sizeof(save); + getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, (void*)&save, &sz); + + /* set a timeout on the blocking recv so we don't hang */ + tv.tv_sec = 2; + tv.tv_usec = 0; + setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + /* receive the status reply */ rc = pmix_usock_recv_blocking(sd, (char*)&reply, sizeof(int)); if (PMIX_SUCCESS != rc) { @@ -802,6 +819,9 @@ static int recv_connect_ack(int sd) return rc; } + /* return the socket to normal */ + setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sz); + return PMIX_SUCCESS; } diff --git a/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client_pub.c b/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client_pub.c index 186a28c40f..b3103b1bd1 100644 --- a/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client_pub.c +++ b/opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client_pub.c @@ -61,9 +61,7 @@ static void wait_lookup_cbfunc(struct pmix_peer_t *pr, pmix_usock_hdr_t *hdr, static void lookup_cbfunc(int status, pmix_pdata_t pdata[], size_t ndata, void *cbdata); -int PMIx_Publish(pmix_data_range_t scope, - pmix_persistence_t persist, - const pmix_info_t info[], +int PMIx_Publish(const pmix_info_t info[], size_t ninfo) { int rc; @@ -85,7 +83,7 @@ int PMIx_Publish(pmix_data_range_t scope, cb = PMIX_NEW(pmix_cb_t); cb->active = true; - if (PMIX_SUCCESS != (rc = PMIx_Publish_nb(scope, persist, info, ninfo, op_cbfunc, cb))) { + if (PMIX_SUCCESS != (rc = PMIx_Publish_nb(info, ninfo, op_cbfunc, cb))) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(cb); return rc; @@ -99,10 +97,7 @@ int PMIx_Publish(pmix_data_range_t scope, return rc; } -int PMIx_Publish_nb(pmix_data_range_t scope, - pmix_persistence_t persist, - const pmix_info_t info[], - size_t ninfo, +int PMIx_Publish_nb(const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_buffer_t *msg; @@ -143,24 +138,14 @@ int PMIx_Publish_nb(pmix_data_range_t scope, PMIX_RELEASE(msg); return rc; } 
- /* pack the data range */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &scope, 1, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - return rc; - } - /* pack the persistence */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &persist, 1, PMIX_PERSIST))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - return rc; - } - /* pack the info keys that were given */ + /* pass the number of info structs - needed on remote end so + * space can be malloc'd for the values */ if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ninfo, 1, PMIX_SIZE))) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(msg); return rc; } + /* pack the info structs */ if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, info, ninfo, PMIX_INFO))) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(msg); @@ -181,9 +166,8 @@ int PMIx_Publish_nb(pmix_data_range_t scope, return PMIX_SUCCESS; } -int PMIx_Lookup(pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, - pmix_pdata_t pdata[], size_t ndata) +int PMIx_Lookup(pmix_pdata_t pdata[], size_t ndata, + const pmix_info_t info[], size_t ninfo) { int rc; pmix_cb_t *cb; @@ -213,8 +197,7 @@ int PMIx_Lookup(pmix_data_range_t scope, cb->nvals = ndata; cb->active = true; - if (PMIX_SUCCESS != (rc = PMIx_Lookup_nb(scope, keys, - info, ninfo, + if (PMIX_SUCCESS != (rc = PMIx_Lookup_nb(keys, info, ninfo, lookup_cbfunc, cb))) { PMIX_RELEASE(cb); pmix_argv_free(keys); @@ -231,8 +214,7 @@ int PMIx_Lookup(pmix_data_range_t scope, return rc; } -int PMIx_Lookup_nb(pmix_data_range_t range, char **keys, - const pmix_info_t info[], size_t ninfo, +int PMIx_Lookup_nb(char **keys, const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata) { pmix_buffer_t *msg; @@ -267,25 +249,6 @@ int PMIx_Lookup_nb(pmix_data_range_t range, char **keys, PMIX_RELEASE(msg); return rc; } - /* pack the range */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &range, 1, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - return rc; - } - /* pack the info structs */ - 
if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ninfo, 1, PMIX_SIZE))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - return rc; - } - if (0 < ninfo) { - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, info, ninfo, PMIX_INFO))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - return rc; - } - } /* pack the keys */ nkeys = pmix_argv_count(keys); if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &nkeys, 1, PMIX_SIZE))) { @@ -302,6 +265,19 @@ int PMIx_Lookup_nb(pmix_data_range_t range, char **keys, } } } + /* pass the number of info structs - needed on remote end so + * space can be malloc'd for the values */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ninfo, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* pack the info structs */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, info, ninfo, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } /* create a callback object as we need to pass it to the * recv routine so we know which callback to use when @@ -316,7 +292,7 @@ int PMIx_Lookup_nb(pmix_data_range_t range, char **keys, return PMIX_SUCCESS; } -int PMIx_Unpublish(pmix_data_range_t scope, char **keys) +int PMIx_Unpublish(char **keys, const pmix_info_t info[], size_t ninfo) { int rc; pmix_cb_t *cb; @@ -331,7 +307,7 @@ int PMIx_Unpublish(pmix_data_range_t scope, char **keys) cb->active = true; /* push the message into our event base to send to the server */ - if (PMIX_SUCCESS != (rc = PMIx_Unpublish_nb(scope, keys, op_cbfunc, cb))) { + if (PMIX_SUCCESS != (rc = PMIx_Unpublish_nb(keys, info, ninfo, op_cbfunc, cb))) { PMIX_RELEASE(cb); return rc; } @@ -344,7 +320,7 @@ int PMIx_Unpublish(pmix_data_range_t scope, char **keys) return rc; } -int PMIx_Unpublish_nb(pmix_data_range_t range, char **keys, +int PMIx_Unpublish_nb(char **keys, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_buffer_t *msg; @@ -374,12 +350,6 @@ int PMIx_Unpublish_nb(pmix_data_range_t range, char **keys, 
PMIX_RELEASE(msg); return rc; } - /* pack the range */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &range, 1, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(msg); - return rc; - } /* pack the number of keys */ i = pmix_argv_count(keys); if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &i, 1, PMIX_SIZE))) { @@ -396,6 +366,19 @@ int PMIx_Unpublish_nb(pmix_data_range_t range, char **keys, } } } + /* pass the number of info structs - needed on remote end so + * space can be malloc'd for the values */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ninfo, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* pack the info structs */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, info, ninfo, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); diff --git a/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server.c index 88c1070060..682c374e52 100644 --- a/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server.c @@ -246,13 +246,16 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module) security_mode = strdup(pmix_sec.name); /* find the temp dir */ - if (NULL == (tdir = getenv("TMPDIR"))) { - if (NULL == (tdir = getenv("TEMP"))) { - if (NULL == (tdir = getenv("TMP"))) { - tdir = "/tmp"; + if (NULL == (tdir = getenv("PMIX_SERVER_TMPDIR"))) { + if (NULL == (tdir = getenv("TMPDIR"))) { + if (NULL == (tdir = getenv("TEMP"))) { + if (NULL == (tdir = getenv("TMP"))) { + tdir = "/tmp"; + } } } } + /* now set the address - we use the pid here to reduce collisions */ memset(&myaddress, 0, sizeof(struct sockaddr_un)); myaddress.sun_family = AF_UNIX; @@ -1879,7 +1882,7 @@ static void cnct_cbfunc(int status, void *cbdata) scd = PMIX_NEW(pmix_shift_caddy_t); scd->status = status; scd->tracker = tracker; - PMIX_THREADSHIFT(scd, _mdxcbfunc); + 
PMIX_THREADSHIFT(scd, _cnct); } diff --git a/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server_ops.c b/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server_ops.c index 0970a45e55..895be3180f 100644 --- a/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server_ops.c +++ b/opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server_ops.c @@ -979,9 +979,7 @@ pmix_status_t pmix_server_publish(pmix_peer_t *peer, { pmix_status_t rc; int32_t cnt; - pmix_data_range_t range; - pmix_persistence_t persist; - size_t i, ninfo, einfo; + size_t ninfo, einfo; pmix_info_t *info = NULL; pmix_proc_t proc; uint32_t uid; @@ -999,18 +997,6 @@ pmix_status_t pmix_server_publish(pmix_peer_t *peer, PMIX_ERROR_LOG(rc); return rc; } - /* unpack the scope */ - cnt=1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &range, &cnt, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); - return rc; - } - /* unpack the persistence */ - cnt=1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &persist, &cnt, PMIX_PERSIST))) { - PMIX_ERROR_LOG(rc); - return rc; - } /* unpack the number of info objects */ cnt=1; if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &ninfo, &cnt, PMIX_SIZE))) { @@ -1035,7 +1021,8 @@ pmix_status_t pmix_server_publish(pmix_peer_t *peer, /* call the local server */ (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN); proc.rank = peer->info->rank; - rc = pmix_host_server.publish(&proc, range, persist, info, einfo, cbfunc, cbdata); + pmix_output_verbose(2, pmix_globals.debug_output, "server passing %d values up", (int)einfo); + rc = pmix_host_server.publish(&proc, info, einfo, cbfunc, cbdata); cleanup: PMIX_INFO_FREE(info, einfo); @@ -1048,8 +1035,6 @@ pmix_status_t pmix_server_lookup(pmix_peer_t *peer, { int32_t cnt; pmix_status_t rc; - int wait; - pmix_data_range_t range; size_t nkeys, i; char **keys=NULL, *sptr; pmix_info_t *info = NULL; @@ -1070,12 +1055,22 @@ pmix_status_t pmix_server_lookup(pmix_peer_t *peer, PMIX_ERROR_LOG(rc); return rc; } - /* unpack the range */ + /* unpack the number of keys */ cnt=1; - if 
(PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &range, &cnt, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &nkeys, &cnt, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); return rc; } + /* unpack the array of keys */ + for (i=0; i < nkeys; i++) { + cnt=1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &sptr, &cnt, PMIX_STRING))) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + pmix_argv_append_nosize(&keys, sptr); + free(sptr); + } /* unpack the number of info objects */ cnt=1; if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &ninfo, &cnt, PMIX_SIZE))) { @@ -1098,27 +1093,10 @@ pmix_status_t pmix_server_lookup(pmix_peer_t *peer, info[einfo-1].value.type = PMIX_UINT32; info[einfo-1].value.data.uint32 = uid; - /* unpack the number of keys */ - cnt=1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &nkeys, &cnt, PMIX_SIZE))) { - PMIX_ERROR_LOG(rc); - return rc; - } - /* unpack the array of keys */ - for (i=0; i < nkeys; i++) { - cnt=1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &sptr, &cnt, PMIX_STRING))) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - pmix_argv_append_nosize(&keys, sptr); - free(sptr); - } - /* call the local server */ (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN); proc.rank = peer->info->rank; - rc = pmix_host_server.lookup(&proc, range, info, einfo, keys, cbfunc, cbdata); + rc = pmix_host_server.lookup(&proc, keys, info, einfo, cbfunc, cbdata); cleanup: PMIX_INFO_FREE(info, einfo); @@ -1132,12 +1110,11 @@ pmix_status_t pmix_server_unpublish(pmix_peer_t *peer, { int32_t cnt; pmix_status_t rc; - pmix_data_range_t range; - size_t i, nkeys; + size_t i, nkeys, ninfo, einfo; char **keys=NULL, *sptr; pmix_proc_t proc; uint32_t uid; - pmix_info_t info; + pmix_info_t *info; pmix_output_verbose(2, pmix_globals.debug_output, "recvd UNPUBLISH"); @@ -1152,12 +1129,6 @@ pmix_status_t pmix_server_unpublish(pmix_peer_t *peer, PMIX_ERROR_LOG(rc); return rc; } - /* unpack the range */ - cnt=1; - if 
(PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &range, &cnt, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); - return rc; - } /* unpack the number of keys */ cnt=1; if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &nkeys, &cnt, PMIX_SIZE))) { @@ -1174,16 +1145,32 @@ pmix_status_t pmix_server_unpublish(pmix_peer_t *peer, pmix_argv_append_nosize(&keys, sptr); free(sptr); } - /* setup the info key */ - PMIX_INFO_CONSTRUCT(&info); - (void)strncpy(info.key, PMIX_USERID, PMIX_MAX_KEYLEN); - info.value.type = PMIX_UINT32; - info.value.data.uint32 = uid; + /* unpack the number of info objects */ + cnt=1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &ninfo, &cnt, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + return rc; + } + /* we will be adding one for the user id */ + einfo = ninfo + 1; + PMIX_INFO_CREATE(info, einfo); + /* unpack the array of info objects */ + if (0 < ninfo) { + /* info already holds einfo entries - do not reallocate, the last slot is reserved for the uid */ + cnt=ninfo; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, info, &cnt, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + } + (void)strncpy(info[einfo-1].key, PMIX_USERID, PMIX_MAX_KEYLEN); + info[einfo-1].value.type = PMIX_UINT32; + info[einfo-1].value.data.uint32 = uid; /* call the local server */ (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN); proc.rank = peer->info->rank; - rc = pmix_host_server.unpublish(&proc, range, &info, 1, keys, cbfunc, cbdata); + rc = pmix_host_server.unpublish(&proc, keys, info, einfo, cbfunc, cbdata); cleanup: pmix_argv_free(keys); diff --git a/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.c b/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.c index 34f6b971c0..8bd03942fe 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.c +++ b/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.c @@ -151,7 +151,6 @@ int dmodex_fn(const pmix_proc_t *proc, } int publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t 
cbfunc, void *cbdata) { @@ -184,8 +183,8 @@ int publish_fn(const pmix_proc_t *proc, return PMIX_SUCCESS; } -int lookup_fn(const pmix_proc_t *proc, pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +int lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata) { size_t i, ndata, ret; @@ -216,26 +215,26 @@ int lookup_fn(const pmix_proc_t *proc, pmix_data_range_t scope, return PMIX_SUCCESS; } -int unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, char **keys, +int unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { - size_t i, ninfo; - pmix_test_info_t *info, *next; + size_t i; + pmix_test_info_t *iptr, *next; if (NULL == pmix_test_published_list) { return PMIX_ERR_NOT_FOUND; } - PMIX_LIST_FOREACH_SAFE(info, next, pmix_test_published_list, pmix_test_info_t) { - if (1) {// if data posted by this process + PMIX_LIST_FOREACH_SAFE(iptr, next, pmix_test_published_list, pmix_test_info_t) { + if (1) { // if data posted by this process if (NULL == keys) { - pmix_list_remove_item(pmix_test_published_list, &info->super); - PMIX_RELEASE(info); + pmix_list_remove_item(pmix_test_published_list, &iptr->super); + PMIX_RELEASE(iptr); } else { ninfo = pmix_argv_count(keys); for (i = 0; i < ninfo; i++) { - if (!strcmp(info->data.key, keys[i])) { - pmix_list_remove_item(pmix_test_published_list, &info->super); - PMIX_RELEASE(info); + if (!strcmp(iptr->data.key, keys[i])) { + pmix_list_remove_item(pmix_test_published_list, &iptr->super); + PMIX_RELEASE(iptr); break; } } diff --git a/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.h b/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.h index 456ea56a33..88075f0161 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.h +++ b/opal/mca/pmix/pmix1xx/pmix/test/server_callbacks.h @@ -29,14 +29,13 @@ pmix_status_t dmodex_fn(const 
pmix_proc_t *proc, const pmix_info_t info[], size_t ninfo, pmix_modex_cbfunc_t cbfunc, void *cbdata); pmix_status_t publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -pmix_status_t lookup_fn(const pmix_proc_t *proc, pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +pmix_status_t lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata); -pmix_status_t unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, char **keys, +pmix_status_t unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); pmix_status_t spawn_fn(const pmix_proc_t *proc, const pmix_info_t job_info[], size_t ninfo, diff --git a/opal/mca/pmix/pmix1xx/pmix/test/simple/simpclient.c b/opal/mca/pmix/pmix1xx/pmix/test/simple/simpclient.c index adf784d49c..24b197b37d 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/simple/simpclient.c +++ b/opal/mca/pmix/pmix1xx/pmix/test/simple/simpclient.c @@ -44,11 +44,11 @@ int main(int argc, char **argv) char *tmp; pmix_proc_t proc, myproc; uint32_t nprocs, n; - + /* init us */ if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc))) { pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %d", myproc.nspace, myproc.rank, rc); - exit(0); + exit(rc); } pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank); @@ -60,7 +60,7 @@ int main(int argc, char **argv) nprocs = val->data.uint32; PMIX_VALUE_RELEASE(val); pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs); - + /* put a few values */ (void)asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank); value.type = PMIX_UINT32; @@ -99,7 +99,7 @@ int main(int argc, char **argv) pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc); goto 
done; } - + /* check the returned data */ (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); for (n=0; n < nprocs; n++) { @@ -156,5 +156,5 @@ int main(int argc, char **argv) fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank); } fflush(stderr); - return(0); + return(rc); } diff --git a/opal/mca/pmix/pmix1xx/pmix/test/simple/simppub.c b/opal/mca/pmix/pmix1xx/pmix/test/simple/simppub.c index e4fe8438e9..b7278fc5b4 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/simple/simppub.c +++ b/opal/mca/pmix/pmix1xx/pmix/test/simple/simppub.c @@ -63,7 +63,7 @@ int main(int argc, char **argv) nprocs = val->data.uint32; PMIX_VALUE_RELEASE(val); pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs); - + /* call fence to ensure the data is received */ PMIX_PROC_CONSTRUCT(&proc); (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); @@ -72,7 +72,7 @@ int main(int argc, char **argv) pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc); goto done; } - + /* publish something */ if (0 == myproc.rank) { PMIX_INFO_CREATE(info, 2); @@ -82,7 +82,7 @@ int main(int argc, char **argv) (void)strncpy(info[1].key, "PANDA", PMIX_MAX_KEYLEN); info[1].value.type = PMIX_SIZE; info[1].value.data.size = 123456; - if (PMIX_SUCCESS != (rc = PMIx_Publish(PMIX_GLOBAL, PMIX_PERSIST_APP, info, 2))) { + if (PMIX_SUCCESS != (rc = PMIx_Publish(info, 2))) { pmix_output(0, "Client ns %s rank %d: PMIx_Publish failed: %d", myproc.nspace, myproc.rank, rc); goto done; } @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (0 != myproc.rank) { PMIX_PDATA_CREATE(pdata, 1); (void)strncpy(pdata[0].key, "FOOBAR", PMIX_MAX_KEYLEN); - if (PMIX_SUCCESS != (rc = PMIx_Lookup(PMIX_GLOBAL, NULL, 0, pdata, 1))) { + if (PMIX_SUCCESS != (rc = PMIx_Lookup(pdata, 1, NULL, 0))) { pmix_output(0, "Client ns %s rank %d: PMIx_Lookup failed: %d", myproc.nspace, myproc.rank, rc); goto done; } @@ -140,7 
+140,7 @@ int main(int argc, char **argv) pmix_argv_append_nosize(&keys, "FOOBAR"); pmix_argv_append_nosize(&keys, "PANDA"); - if (PMIX_SUCCESS != (rc = PMIx_Unpublish(PMIX_GLOBAL, keys))) { + if (PMIX_SUCCESS != (rc = PMIx_Unpublish(keys, NULL, 0))) { pmix_output(0, "Client ns %s rank %d: PMIx_Unpublish failed: %d", myproc.nspace, myproc.rank, rc); goto done; } diff --git a/opal/mca/pmix/pmix1xx/pmix/test/simple/simptest.c b/opal/mca/pmix/pmix1xx/pmix/test/simple/simptest.c index d555e68d34..1b4cfd93d6 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/simple/simptest.c +++ b/opal/mca/pmix/pmix1xx/pmix/test/simple/simptest.c @@ -24,17 +24,24 @@ #include #include +#include #include #include #include #include +#include +#include +#include +#include +#include PMIX_EVENT_HEADER #include "src/util/pmix_environ.h" #include "src/util/output.h" #include "src/util/printf.h" #include "src/util/argv.h" #include "src/buffer_ops/buffer_ops.h" +#include "src/usock/usock.h" static pmix_status_t connected(const pmix_proc_t *proc, void *server_object); static pmix_status_t finalized(const pmix_proc_t *proc, void *server_object, @@ -51,14 +58,13 @@ static pmix_status_t dmodex_fn(const pmix_proc_t *proc, const pmix_info_t info[], size_t ninfo, pmix_modex_cbfunc_t cbfunc, void *cbdata); static pmix_status_t publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -static pmix_status_t lookup_fn(const pmix_proc_t *proc, pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static pmix_status_t lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata); -static pmix_status_t unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, char **keys, +static pmix_status_t unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t 
cbfunc, void *cbdata); static pmix_status_t spawn_fn(const pmix_proc_t *proc, const pmix_info_t job_info[], size_t ninfo, @@ -72,8 +78,6 @@ static pmix_status_t disconnect_fn(const pmix_proc_t procs[], size_t nprocs, pmix_op_cbfunc_t cbfunc, void *cbdata); static pmix_status_t register_event_fn(const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -static pmix_status_t listener_fn(int listening_sd, - pmix_connection_cbfunc_t cbfunc); static pmix_server_module_t mymodule = { connected, @@ -88,7 +92,7 @@ static pmix_server_module_t mymodule = { connect_fn, disconnect_fn, register_event_fn, - listener_fn + NULL }; typedef struct { @@ -101,7 +105,7 @@ PMIX_CLASS_INSTANCE(pmix_locdat_t, typedef struct { pmix_object_t super; - volatile bool completed; + volatile bool active; pmix_proc_t caller; pmix_info_t *info; size_t ninfo; @@ -113,7 +117,7 @@ static void xfcon(myxfer_t *p) { p->info = NULL; p->ninfo = 0; - p->completed = false; + p->active = true; p->cbfunc = NULL; p->spcbfunc = NULL; p->cbdata = NULL; @@ -128,26 +132,35 @@ PMIX_CLASS_INSTANCE(myxfer_t, pmix_object_t, xfcon, xfdes); +typedef struct { + pmix_list_item_t super; + pid_t pid; +} wait_tracker_t; +PMIX_CLASS_INSTANCE(wait_tracker_t, + pmix_list_item_t, + NULL, NULL); + static volatile int wakeup; static pmix_list_t pubdata; +static pmix_event_t handler; +static pmix_list_t children; static void set_namespace(int nprocs, char *ranks, char *nspace, pmix_op_cbfunc_t cbfunc, myxfer_t *x); static void errhandler(pmix_status_t status, pmix_proc_t procs[], size_t nprocs, pmix_info_t info[], size_t ninfo); +static void wait_signal_callback(int fd, short event, void *arg); static void opcbfunc(pmix_status_t status, void *cbdata) { myxfer_t *x = (myxfer_t*)cbdata; - x->completed = true; - /* release the caller, if necessary - note that - * this may result in release of x, so this must - * be the last thing we do with it here */ + /* release the caller, if necessary */ if (NULL != x->cbfunc) 
{ x->cbfunc(PMIX_SUCCESS, x->cbdata); } + x->active = false; } int main(int argc, char **argv) @@ -161,6 +174,7 @@ int main(int argc, char **argv) pid_t pid; myxfer_t *x; pmix_proc_t proc; + wait_tracker_t *child; /* smoke test */ if (PMIX_SUCCESS != 0) { @@ -181,6 +195,12 @@ int main(int argc, char **argv) /* setup the pub data, in case it is used */ PMIX_CONSTRUCT(&pubdata, pmix_list_t); + /* setup to see sigchld on the forked tests */ + PMIX_CONSTRUCT(&children, pmix_list_t); + event_assign(&handler, pmix_globals.evbase, SIGCHLD, + EV_SIGNAL|EV_PERSIST,wait_signal_callback, &handler); + event_add(&handler, NULL); + /* see if we were passed the number of procs to run or * the executable to use */ for (n=1; n < (argc-1); n++) { @@ -208,7 +228,6 @@ int main(int argc, char **argv) tmp = pmix_argv_join(atmp, ','); x = PMIX_NEW(myxfer_t); set_namespace(nprocs, tmp, "foobar", opcbfunc, x); - free(tmp); /* set common argv and env */ client_env = pmix_argv_copy(environ); @@ -220,12 +239,8 @@ int main(int argc, char **argv) /* if the nspace registration hasn't completed yet, * wait for it here */ - while (!x->completed) { - struct timespec ts; - ts.tv_sec = 0; - ts.tv_nsec = 100000; - nanosleep(&ts, NULL); - } + PMIX_WAIT_FOR_COMPLETION(x->active); + free(tmp); PMIX_RELEASE(x); /* fork/exec the test */ @@ -246,12 +261,7 @@ int main(int argc, char **argv) } /* don't fork/exec the client until we know it is registered * so we avoid a potential race condition in the server */ - while (!x->completed) { - struct timespec ts; - ts.tv_sec = 0; - ts.tv_nsec = 100000; - nanosleep(&ts, NULL); - } + PMIX_WAIT_FOR_COMPLETION(x->active); PMIX_RELEASE(x); pid = fork(); if (pid < 0) { @@ -259,6 +269,9 @@ int main(int argc, char **argv) PMIx_server_finalize(); return -1; } + child = PMIX_NEW(wait_tracker_t); + child->pid = pid; + pmix_list_append(&children, &child->super); if (pid == 0) { execve(executable, client_argv, client_env); @@ -438,7 +451,6 @@ static int dmodex_fn(const 
pmix_proc_t *proc, static int publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { @@ -462,9 +474,8 @@ static int publish_fn(const pmix_proc_t *proc, } -static int lookup_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static int lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata) { pmix_locdat_t *p, *p2; @@ -512,8 +523,8 @@ static int lookup_fn(const pmix_proc_t *proc, } -static int unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, char **keys, +static int unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_locdat_t *p, *p2; @@ -610,10 +621,39 @@ static pmix_status_t register_event_fn(const pmix_info_t info[], size_t ninfo, return PMIX_SUCCESS; } -static int listener_fn(int listening_sd, - pmix_connection_cbfunc_t cbfunc) +static void wait_signal_callback(int fd, short event, void *arg) { - return PMIX_SUCCESS; + pmix_event_t *sig = (pmix_event_t*) arg; + int status; + pid_t pid; + wait_tracker_t *t2; + + if (SIGCHLD != event_get_signal(sig)) { + return; + } + + /* we can have multiple children leave but only get one + * sigchild callback, so reap all the waitpids until we + * don't get anything valid back */ + while (1) { + pid = waitpid(-1, &status, WNOHANG); + if (-1 == pid && EINTR == errno) { + /* try it again */ + continue; + } + /* if we got garbage, then nothing we can do */ + if (pid <= 0) { + return; + } + + /* we are already in an event, so it is safe to access the list */ + PMIX_LIST_FOREACH(t2, &children, wait_tracker_t) { + if (pid == t2->pid) { + /* found it! 
*/ + --wakeup; + break; + } + } + } } - diff --git a/opal/mca/pmix/pmix1xx/pmix/test/test_fence.c b/opal/mca/pmix/pmix1xx/pmix/test/test_fence.c index c5ec09b3c3..c33fb48dd1 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/test_fence.c +++ b/opal/mca/pmix/pmix1xx/pmix/test/test_fence.c @@ -86,11 +86,11 @@ static void add_noise(char *noise_param, char *my_nspace, int my_rank) SET_KEY(key, fence_num, ind, use_same_keys); \ (void)strncpy(foobar.nspace, ns, PMIX_MAX_NSLEN); \ foobar.rank = r; \ - TEST_VERBOSE(("%s:%d want to get from %s:%d key %s", my_nspace, my_rank, ns, r, key)); \ + TEST_VERBOSE(("%s:%d want to get from %s:%d key %s", my_nspace, my_rank, ns, r, key)); \ if (blocking) { \ - if (PMIX_SUCCESS != (rc = PMIx_Get(&foobar, key, NULL, 0, &val))) { \ + if (PMIX_SUCCESS != (rc = PMIx_Get(&foobar, key, NULL, 0, &val))) { \ if( !( rc == PMIX_ERR_NOT_FOUND && ok_notfnd ) ){ \ - TEST_ERROR(("%s:%d: PMIx_Get failed: %d from %s:%d", my_nspace, my_rank, rc, ns, r)); \ + TEST_ERROR(("%s:%d: PMIx_Get failed: %d from %s:%d, key %s", my_nspace, my_rank, rc, ns, r, key)); \ } \ rc = PMIX_ERROR; \ } \ @@ -99,8 +99,8 @@ static void add_noise(char *noise_param, char *my_nspace, int my_rank) cbdata.in_progress = 1; \ PMIX_VALUE_CREATE(val, 1); \ cbdata.kv = val; \ - if (PMIX_SUCCESS != (rc = PMIx_Get_nb(&foobar, key, NULL, 0, get_cb, (void*)&cbdata))) { \ - TEST_VERBOSE(("%s:%d: PMIx_Get_nb failed: %d from %s:%d", my_nspace, my_rank, rc, ns, r)); \ + if (PMIX_SUCCESS != (rc = PMIx_Get_nb(&foobar, key, NULL, 0, get_cb, (void*)&cbdata))) { \ + TEST_VERBOSE(("%s:%d: PMIx_Get_nb failed: %d from %s:%d, key=%s", my_nspace, my_rank, rc, ns, r, key)); \ rc = PMIX_ERROR; \ } else { \ count = 0; \ @@ -116,7 +116,8 @@ static void add_noise(char *noise_param, char *my_nspace, int my_rank) if (PMIX_SUCCESS == rc) { \ if( PMIX_SUCCESS != cbdata.status ){ \ if( !( rc == PMIX_ERR_NOT_FOUND && ok_notfnd ) ){ \ - TEST_VERBOSE(("%s:%d: PMIx_Get_nb failed: %d from %s:%d", my_nspace, my_rank, rc, 
my_nspace, r));\ + TEST_VERBOSE(("%s:%d: PMIx_Get_nb failed: %d from %s:%d, key=%s", \ + my_nspace, my_rank, rc, my_nspace, r, key)); \ } \ rc = PMIX_ERROR; \ } else if (NULL == val) { \ diff --git a/opal/mca/pmix/pmix1xx/pmix/test/test_publish.c b/opal/mca/pmix/pmix1xx/pmix/test/test_publish.c index ac624b142f..17820b3d40 100644 --- a/opal/mca/pmix/pmix1xx/pmix/test/test_publish.c +++ b/opal/mca/pmix/pmix1xx/pmix/test/test_publish.c @@ -59,10 +59,10 @@ static int test_publish(char *my_nspace, int my_rank, int blocking) info.value.type = PMIX_STRING; info.value.data.string = strdup(data); if (blocking) { - rc = PMIx_Publish(PMIX_NAMESPACE, PMIX_PERSIST_INDEF, &info, 1); + rc = PMIx_Publish(&info, 1); } else { int in_progress = 1; - rc = PMIx_Publish_nb(PMIX_NAMESPACE, PMIX_PERSIST_INDEF, &info, 1, release_cb, &in_progress); + rc = PMIx_Publish_nb(&info, 1, release_cb, &in_progress); if (PMIX_SUCCESS == rc) { PMIX_WAIT_FOR_COMPLETION(in_progress); } @@ -83,7 +83,7 @@ static int test_lookup(char *my_nspace, int my_rank, int blocking) (void)snprintf(data, 512, "data from proc %s:%d", my_nspace, my_rank); if (blocking) { - if (PMIX_SUCCESS != (rc = PMIx_Lookup(PMIX_NAMESPACE, NULL, 0, &pdata, 1))) { + if (PMIX_SUCCESS != (rc = PMIx_Lookup(&pdata, 1, NULL, 0))) { PMIX_PDATA_DESTRUCT(&pdata); return rc; } @@ -98,7 +98,7 @@ static int test_lookup(char *my_nspace, int my_rank, int blocking) cbdata.pdata = &pdata; /* copy the key across */ (void)strncpy(pdata.key, keys[0], PMIX_MAX_KEYLEN); - rc = PMIx_Lookup_nb(PMIX_NAMESPACE, keys, NULL, 0, lookup_cb, (void*)&cbdata); + rc = PMIx_Lookup_nb(keys, NULL, 0, lookup_cb, (void*)&cbdata); if (PMIX_SUCCESS != rc) { PMIX_PDATA_DESTRUCT(&pdata); return rc; @@ -130,10 +130,10 @@ static int test_unpublish(char *my_nspace, int my_rank, int blocking) keys[1] = NULL; if (blocking) { - rc = PMIx_Unpublish(PMIX_NAMESPACE, keys); + rc = PMIx_Unpublish(keys, NULL, 0); } else { int in_progress = 1; - rc = PMIx_Unpublish_nb(PMIX_NAMESPACE, keys, 
release_cb, &in_progress); + rc = PMIx_Unpublish_nb(keys, NULL, 0, release_cb, &in_progress); if (PMIX_SUCCESS == rc) { PMIX_WAIT_FOR_COMPLETION(in_progress); } diff --git a/opal/mca/pmix/pmix1xx/pmix1.h b/opal/mca/pmix/pmix1xx/pmix1.h index 5cdfdf385d..8141c26458 100644 --- a/opal/mca/pmix/pmix1xx/pmix1.h +++ b/opal/mca/pmix/pmix1xx/pmix1.h @@ -89,20 +89,15 @@ OPAL_MODULE_DECLSPEC int pmix1_get(const opal_process_name_t *proc, OPAL_MODULE_DECLSPEC int pmix1_getnb(const opal_process_name_t *proc, const char *key, opal_pmix_value_cbfunc_t cbfunc, void *cbdata); -OPAL_MODULE_DECLSPEC int pmix1_publish(opal_pmix_data_range_t scope, - opal_pmix_persistence_t persist, - opal_list_t *info); -OPAL_MODULE_DECLSPEC int pmix1_publishnb(opal_pmix_data_range_t scope, - opal_pmix_persistence_t persist, - opal_list_t *info, +OPAL_MODULE_DECLSPEC int pmix1_publish(opal_list_t *info); +OPAL_MODULE_DECLSPEC int pmix1_publishnb(opal_list_t *info, + opal_pmix_op_cbfunc_t cbfunc, void *cbdata); +OPAL_MODULE_DECLSPEC int pmix1_lookup(opal_list_t *data, opal_list_t *info); +OPAL_MODULE_DECLSPEC int pmix1_lookupnb(char **keys, opal_list_t *info, + opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata); +OPAL_MODULE_DECLSPEC int pmix1_unpublish(char **keys, opal_list_t *info); +OPAL_MODULE_DECLSPEC int pmix1_unpublishnb(char **keys, opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); -OPAL_MODULE_DECLSPEC int pmix1_lookup(opal_pmix_data_range_t scope, - opal_list_t *data); -OPAL_MODULE_DECLSPEC int pmix1_lookupnb(opal_pmix_data_range_t scope, int wait, char **keys, - opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata); -OPAL_MODULE_DECLSPEC int pmix1_unpublish(opal_pmix_data_range_t scope, char **keys); -OPAL_MODULE_DECLSPEC int pmix1_unpublishnb(opal_pmix_data_range_t scope, char **keys, - opal_pmix_op_cbfunc_t cbfunc, void *cbdata); OPAL_MODULE_DECLSPEC int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid); OPAL_MODULE_DECLSPEC int pmix1_spawnnb(opal_list_t 
*job_info, opal_list_t *apps, opal_pmix_spawn_cbfunc_t cbfunc, void *cbdata); @@ -119,8 +114,6 @@ OPAL_MODULE_DECLSPEC int pmix1_resolve_peers(const char *nodename, opal_jobid_t OPAL_MODULE_DECLSPEC int pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist); /**** COMMON FUNCTIONS ****/ -OPAL_MODULE_DECLSPEC void pmix1_register_errhandler(opal_pmix_errhandler_fn_t errhandler); -OPAL_MODULE_DECLSPEC void pmix1_deregister_errhandler(void); OPAL_MODULE_DECLSPEC int pmix1_store_local(const opal_process_name_t *proc, opal_value_t *val); diff --git a/opal/mca/pmix/pmix1xx/pmix1_client.c b/opal/mca/pmix/pmix1xx/pmix1_client.c index 261c0e1be8..ddb404dc9a 100644 --- a/opal/mca/pmix/pmix1xx/pmix1_client.c +++ b/opal/mca/pmix/pmix1xx/pmix1_client.c @@ -33,14 +33,42 @@ static pmix_proc_t myproc; static char *dbgvalue=NULL; -static int convert_scope(pmix_scope_t *scope, - opal_pmix_scope_t sc); -static int convert_persistence(pmix_persistence_t *p, - opal_pmix_persistence_t persist); -static int convert_data_range(pmix_data_range_t *sc, - opal_pmix_data_range_t scope); +static void myerr(pmix_status_t status, + pmix_proc_t procs[], size_t nprocs, + pmix_info_t info[], size_t ninfo) +{ + int rc; + opal_list_t plist, ilist; + opal_namelist_t *nm; + opal_value_t *iptr; + size_t n; + /* convert the incoming status */ + rc = pmix1_convert_rc(status); + /* convert the array of procs */ + OBJ_CONSTRUCT(&plist, opal_list_t); + for (n=0; n < nprocs; n++) { + nm = OBJ_NEW(opal_namelist_t); + nm->name.jobid = strtoul(procs[n].nspace, NULL, 10); + nm->name.vpid = procs[n].rank; + opal_list_append(&plist, &nm->super); + } + + /* convert the array of info */ + OBJ_CONSTRUCT(&ilist, opal_list_t); + for (n=0; n < ninfo; n++) { + iptr = OBJ_NEW(opal_value_t); + iptr->key = strdup(info[n].key); + pmix1_value_unload(iptr, &info[n].value); + opal_list_append(&ilist, &iptr->super); + } + + /* call the base errhandler */ + opal_pmix_base_errhandler(rc, &plist, &ilist); + OPAL_LIST_DESTRUCT(&plist); 
+ OPAL_LIST_DESTRUCT(&ilist); +} int pmix1_client_init(void) { @@ -56,19 +84,28 @@ int pmix1_client_init(void) putenv(dbgvalue); } rc = PMIx_Init(&myproc); - if (PMIX_SUCCESS == rc) { - /* store our jobid and rank */ - opal_convert_string_to_jobid(&pname.jobid, myproc.nspace); - pname.vpid = myproc.rank; - opal_proc_set_name(&pname); + if (PMIX_SUCCESS != rc) { + return pmix1_convert_rc(rc); } - return pmix1_convert_rc(rc); + + /* store our jobid and rank */ + opal_convert_string_to_jobid(&pname.jobid, myproc.nspace); + pname.vpid = myproc.rank; + opal_proc_set_name(&pname); + + /* register the errhandler */ + PMIx_Register_errhandler(NULL, 0, myerr); + return OPAL_SUCCESS; + } int pmix1_client_finalize(void) { pmix_status_t rc; + /* deregister the errhandler */ + PMIx_Deregister_errhandler(); + rc = PMIx_Finalize(); return pmix1_convert_rc(rc); } @@ -222,28 +259,21 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data, } int pmix1_put(opal_pmix_scope_t scope, - opal_value_t *val) + opal_value_t *val) { - pmix_scope_t pscope; pmix_value_t kv; pmix_status_t rc; - int irc; - - /* convert the scope */ - if (OPAL_SUCCESS != (irc = convert_scope(&pscope, scope))) { - return irc; - } PMIX_VALUE_CONSTRUCT(&kv); pmix1_value_load(&kv, val); - rc = PMIx_Put(pscope, val->key, &kv); + rc = PMIx_Put(scope, val->key, &kv); PMIX_VALUE_DESTRUCT(&kv); return pmix1_convert_rc(rc); } int pmix1_get(const opal_process_name_t *proc, - const char *key, opal_value_t **val) + const char *key, opal_value_t **val) { int ret; pmix_value_t *kv; @@ -308,9 +338,8 @@ static void val_cbfunc(pmix_status_t status, OBJ_RELEASE(op); } -int pmix1_getnb(const opal_process_name_t *proc, - const char *key, - opal_pmix_value_cbfunc_t cbfunc, void *cbdata) +int pmix1_getnb(const opal_process_name_t *proc, const char *key, + opal_pmix_value_cbfunc_t cbfunc, void *cbdata) { pmix1_opcaddy_t *op; pmix_status_t rc; @@ -342,29 +371,13 @@ int pmix1_getnb(const opal_process_name_t *proc, return 
pmix1_convert_rc(rc); } -int pmix1_publish(opal_pmix_data_range_t scope, - opal_pmix_persistence_t persist, - opal_list_t *info) +int pmix1_publish(opal_list_t *info) { - pmix_data_range_t rng; - pmix_persistence_t pst; - int rc; pmix_info_t *pinfo; pmix_status_t ret; opal_value_t *iptr; size_t sz, n; - rc = convert_data_range(&rng, scope); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - rc = convert_persistence(&pst, scope); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - sz = opal_list_get_size(info); if (0 < sz) { PMIX_INFO_CREATE(pinfo, sz); @@ -376,33 +389,19 @@ int pmix1_publish(opal_pmix_data_range_t scope, } } - ret = PMIx_Publish(rng, pst, pinfo, sz); + ret = PMIx_Publish(pinfo, sz); return pmix1_convert_rc(ret); } -int pmix1_publishnb(opal_pmix_data_range_t scope, - opal_pmix_persistence_t persist, - opal_list_t *info, - opal_pmix_op_cbfunc_t cbfunc, void *cbdata) +int pmix1_publishnb(opal_list_t *info, + opal_pmix_op_cbfunc_t cbfunc, void *cbdata) { - pmix_data_range_t rng; - pmix_persistence_t pst; - int rc; pmix_status_t ret; opal_value_t *iptr; size_t n; pmix1_opcaddy_t *op; - rc = convert_data_range(&rng, scope); - if (OPAL_SUCCESS != rc) { - return rc; - } - rc = convert_persistence(&pst, persist); - if (OPAL_SUCCESS != rc) { - return rc; - } - /* create the caddy */ op = OBJ_NEW(pmix1_opcaddy_t); op->opcbfunc = cbfunc; @@ -419,34 +418,39 @@ int pmix1_publishnb(opal_pmix_data_range_t scope, } } - ret = PMIx_Publish_nb(rng, pst, op->info, op->sz, opcbfunc, op); + ret = PMIx_Publish_nb(op->info, op->sz, opcbfunc, op); return pmix1_convert_rc(ret); } -int pmix1_lookup(opal_pmix_data_range_t scope, - opal_list_t *data) +int pmix1_lookup(opal_list_t *data, opal_list_t *info) { - pmix_data_range_t rng; pmix_pdata_t *pdata; - size_t sz, n; + pmix_info_t *pinfo; + size_t sz, ninfo, n; int rc; pmix_status_t ret; opal_pmix_pdata_t *d; + opal_value_t *iptr; - rc = convert_data_range(&rng, scope); - if (OPAL_SUCCESS != 
rc) { - return rc; - } sz = opal_list_get_size(data); - PMIX_PDATA_CREATE(pdata, sz); n=0; OPAL_LIST_FOREACH(d, data, opal_pmix_pdata_t) { (void)strncpy(pdata[n++].key, d->value.key, PMIX_MAX_KEYLEN); } - ret = PMIx_Lookup(rng, NULL, 0, pdata, sz); + ninfo = opal_list_get_size(info); + PMIX_INFO_CREATE(pinfo, ninfo); + n=0; + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + (void)strncpy(pinfo[n].key, iptr->key, PMIX_MAX_KEYLEN); + pmix1_value_load(&pinfo[n].value, iptr); + ++n; + } + + ret = PMIx_Lookup(pdata, sz, pinfo, ninfo); + PMIX_INFO_FREE(pinfo, ninfo); if (PMIX_SUCCESS == ret) { /* transfer the data back */ @@ -523,64 +527,82 @@ static void lk_cbfunc(pmix_status_t status, OBJ_RELEASE(op); } -int pmix1_lookupnb(opal_pmix_data_range_t scope, int wait, char **keys, +int pmix1_lookupnb(char **keys, opal_list_t *info, opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata) { - pmix_data_range_t rng; - int rc; pmix_status_t ret; pmix1_opcaddy_t *op; - - rc = convert_data_range(&rng, scope); - if (OPAL_SUCCESS != rc) { - return rc; - } + opal_value_t *iptr; + size_t n; /* create the caddy */ op = OBJ_NEW(pmix1_opcaddy_t); op->lkcbfunc = cbfunc; op->cbdata = cbdata; - ret = PMIx_Lookup_nb(rng, keys, NULL, 0, lk_cbfunc, op); - - return pmix1_convert_rc(ret); -} - -int pmix1_unpublish(opal_pmix_data_range_t scope, char **keys) -{ - int rc; - pmix_status_t ret; - pmix_data_range_t rng; - - rc = convert_data_range(&rng, scope); - if (OPAL_SUCCESS != rc) { - return rc; + op->sz = opal_list_get_size(info); + if (0 < op->sz) { + PMIX_INFO_CREATE(op->info, op->sz); + n=0; + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + (void)strncpy(op->info[n].key, iptr->key, PMIX_MAX_KEYLEN); + pmix1_value_load(&op->info[n].value, iptr); + ++n; + } } - ret = PMIx_Unpublish(rng, keys); + ret = PMIx_Lookup_nb(keys, op->info, op->sz, lk_cbfunc, op); return pmix1_convert_rc(ret); } -int pmix1_unpublishnb(opal_pmix_data_range_t scope, char **keys, - opal_pmix_op_cbfunc_t cbfunc, void *cbdata) 
+int pmix1_unpublish(char **keys, opal_list_t *info) +{ + pmix_status_t ret; + size_t ninfo, n; + pmix_info_t *pinfo; + opal_value_t *iptr; + + ninfo = opal_list_get_size(info); + PMIX_INFO_CREATE(pinfo, ninfo); + n=0; + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + (void)strncpy(pinfo[n].key, iptr->key, PMIX_MAX_KEYLEN); + pmix1_value_load(&pinfo[n].value, iptr); + ++n; + } + + ret = PMIx_Unpublish(keys, pinfo, ninfo); + PMIX_INFO_FREE(pinfo, ninfo); + + return pmix1_convert_rc(ret); +} + +int pmix1_unpublishnb(char **keys, opal_list_t *info, + opal_pmix_op_cbfunc_t cbfunc, void *cbdata) { - int rc; pmix_status_t ret; - pmix_data_range_t rng; pmix1_opcaddy_t *op; - - rc = convert_data_range(&rng, scope); - if (OPAL_SUCCESS != rc) { - return rc; - } + opal_value_t *iptr; + size_t n; /* create the caddy */ op = OBJ_NEW(pmix1_opcaddy_t); op->opcbfunc = cbfunc; op->cbdata = cbdata; - ret = PMIx_Unpublish_nb(rng, keys, opcbfunc, op); + op->sz = opal_list_get_size(info); + if (0 < op->sz) { + PMIX_INFO_CREATE(op->info, op->sz); + n=0; + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + (void)strncpy(op->info[n].key, iptr->key, PMIX_MAX_KEYLEN); + pmix1_value_load(&op->info[n].value, iptr); + ++n; + } + } + + ret = PMIx_Unpublish_nb(keys, op->info, op->sz, opcbfunc, op); return pmix1_convert_rc(ret); } @@ -894,77 +916,3 @@ int pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist) return pmix1_convert_rc(ret);; } - -/*** UTILITY FUNCTIONS ***/ -static int convert_scope(pmix_scope_t *sc, - opal_pmix_scope_t scope) -{ - int rc = PMIX_SUCCESS; - - switch (scope) { - case OPAL_PMIX_SCOPE_UNDEF: - *sc = PMIX_SCOPE_UNDEF; - break; - case OPAL_PMIX_LOCAL: - *sc = PMIX_LOCAL; - break; - case OPAL_PMIX_REMOTE: - *sc = PMIX_REMOTE; - break; - case OPAL_PMIX_GLOBAL: - *sc = PMIX_GLOBAL; - break; - default: - *sc = PMIX_SCOPE_UNDEF; - rc = OPAL_ERR_BAD_PARAM; - break; - } - return rc; -} - -static int convert_persistence(pmix_persistence_t *p, - opal_pmix_persistence_t persist) 
-{ - int rc = OPAL_SUCCESS; - - switch (persist) { - case OPAL_PMIX_PERSIST_INDEF: - *p = PMIX_PERSIST_INDEF; - break; - case OPAL_PMIX_PERSIST_PROC: - *p = PMIX_PERSIST_PROC; - break; - case OPAL_PMIX_PERSIST_APP: - *p = PMIX_PERSIST_APP; - break; - case OPAL_PMIX_PERSIST_SESSION: - *p = PMIX_PERSIST_SESSION; - break; - default: - *p = PMIX_PERSIST_PROC; - rc = OPAL_ERR_BAD_PARAM; - } - return rc; -} - -static int convert_data_range(pmix_data_range_t *sc, - opal_pmix_data_range_t scope) -{ - int rc = OPAL_SUCCESS; - - switch (scope) { - case OPAL_PMIX_DATA_RANGE_UNDEF: - *sc = PMIX_DATA_RANGE_UNDEF; - break; - case OPAL_PMIX_NAMESPACE: - *sc = PMIX_NAMESPACE; - break; - case OPAL_PMIX_SESSION: - *sc = PMIX_SESSION; - break; - default: - *sc = PMIX_DATA_RANGE_UNDEF; - rc = OPAL_ERR_BAD_PARAM; - } - return rc; -} diff --git a/opal/mca/pmix/pmix1xx/pmix1_server_north.c b/opal/mca/pmix/pmix1xx/pmix1_server_north.c index e400341704..a1d38cd482 100644 --- a/opal/mca/pmix/pmix1xx/pmix1_server_north.c +++ b/opal/mca/pmix/pmix1xx/pmix1_server_north.c @@ -59,15 +59,13 @@ static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc, const pmix_info_t info[], size_t ninfo, pmix_modex_cbfunc_t cbfunc, void *cbdata); static pmix_status_t server_publish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata); -static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, - pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t 
ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); static pmix_status_t server_spawn_fn(const pmix_proc_t *proc, const pmix_info_t job_info[], size_t ninfo, @@ -101,10 +99,6 @@ pmix_server_module_t mymodule = { }; opal_pmix_server_module_t *host_module = NULL; -static int convert_data_range(opal_pmix_data_range_t *sc, - pmix_data_range_t scope); -static int convert_persistence(opal_pmix_persistence_t *p, - pmix_persistence_t persist); static void opal_opcbfunc(int status, void *cbdata) @@ -351,7 +345,6 @@ static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *p, } static pmix_status_t server_publish_fn(const pmix_proc_t *p, - pmix_data_range_t scope, pmix_persistence_t persist, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { @@ -359,8 +352,6 @@ static pmix_status_t server_publish_fn(const pmix_proc_t *p, size_t n; pmix1_opalcaddy_t *opalcaddy; opal_process_name_t proc; - opal_pmix_data_range_t oscp; - opal_pmix_persistence_t opers; opal_value_t *oinfo; if (NULL == host_module || NULL == host_module->publish) { @@ -377,16 +368,6 @@ static pmix_status_t server_publish_fn(const pmix_proc_t *p, proc.vpid = p->rank; } - /* convert the data range */ - if (OPAL_SUCCESS != (rc = convert_data_range(&oscp, scope))) { - return pmix1_convert_opalrc(rc); - } - - /* convert the persistence */ - if (OPAL_SUCCESS != (rc = convert_persistence(&opers, persist))) { - return pmix1_convert_opalrc(rc); - } - /* setup the caddy */ opalcaddy = OBJ_NEW(pmix1_opalcaddy_t); opalcaddy->opcbfunc = cbfunc; @@ -404,7 +385,7 @@ static pmix_status_t server_publish_fn(const pmix_proc_t *p, } /* pass it up */ - rc = host_module->publish(&proc, oscp, opers, &opalcaddy->info, opal_opcbfunc, opalcaddy); + rc = host_module->publish(&proc, &opalcaddy->info, opal_opcbfunc, opalcaddy); if (OPAL_SUCCESS != rc) { OBJ_RELEASE(opalcaddy); } @@ -442,13 +423,12 @@ static void opal_lkupcbfunc(int status, OBJ_RELEASE(opalcaddy); } -static pmix_status_t server_lookup_fn(const 
pmix_proc_t *p, pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static pmix_status_t server_lookup_fn(const pmix_proc_t *p, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_lookup_cbfunc_t cbfunc, void *cbdata) { int rc; pmix1_opalcaddy_t *opalcaddy; - opal_pmix_data_range_t oscp; opal_process_name_t proc; opal_value_t *iptr; size_t n; @@ -467,11 +447,6 @@ static pmix_status_t server_lookup_fn(const pmix_proc_t *p, pmix_data_range_t sc proc.vpid = p->rank; } - /* convert the scope */ - if (OPAL_SUCCESS != (rc = convert_data_range(&oscp, scope))) { - return pmix1_convert_opalrc(rc); - } - /* setup the caddy */ opalcaddy = OBJ_NEW(pmix1_opalcaddy_t); opalcaddy->lkupcbfunc = cbfunc; @@ -489,7 +464,7 @@ static pmix_status_t server_lookup_fn(const pmix_proc_t *p, pmix_data_range_t sc } /* pass it up */ - rc = host_module->lookup(&proc, oscp, &opalcaddy->info, keys, opal_lkupcbfunc, opalcaddy); + rc = host_module->lookup(&proc, keys, &opalcaddy->info, opal_lkupcbfunc, opalcaddy); if (OPAL_SUCCESS != rc) { OBJ_RELEASE(opalcaddy); } @@ -498,15 +473,13 @@ static pmix_status_t server_lookup_fn(const pmix_proc_t *p, pmix_data_range_t sc } -static pmix_status_t server_unpublish_fn(const pmix_proc_t *p, - pmix_data_range_t scope, - const pmix_info_t info[], size_t ninfo, char **keys, +static pmix_status_t server_unpublish_fn(const pmix_proc_t *p, char **keys, + const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata) { int rc; pmix1_opalcaddy_t *opalcaddy; opal_process_name_t proc; - opal_pmix_data_range_t oscp; opal_value_t *iptr; size_t n; @@ -524,11 +497,6 @@ static pmix_status_t server_unpublish_fn(const pmix_proc_t *p, proc.vpid = p->rank; } - /* convert the data range */ - if (OPAL_SUCCESS != (rc = convert_data_range(&oscp, scope))) { - return pmix1_convert_opalrc(rc); - } - /* setup the caddy */ opalcaddy = OBJ_NEW(pmix1_opalcaddy_t); opalcaddy->opcbfunc = cbfunc; @@ -546,7 +514,7 @@ static pmix_status_t 
server_unpublish_fn(const pmix_proc_t *p, } /* pass it up */ - rc = host_module->unpublish(&proc, oscp, &opalcaddy->info, keys, opal_opcbfunc, opalcaddy); + rc = host_module->unpublish(&proc, keys, &opalcaddy->info, opal_opcbfunc, opalcaddy); if (OPAL_SUCCESS != rc) { OBJ_RELEASE(opalcaddy); } @@ -802,54 +770,3 @@ static pmix_status_t server_listener_fn(int listening_sd, rc = host_module->listener(listening_sd, cbfunc); return pmix1_convert_opalrc(rc); } - -/**** UTILITY FUNCTIONS ****/ -static int convert_data_range(opal_pmix_data_range_t *sc, - pmix_data_range_t scope) -{ - int rc = OPAL_SUCCESS; - - switch(scope) { - case PMIX_DATA_RANGE_UNDEF: - *sc = OPAL_PMIX_DATA_RANGE_UNDEF; - break; - case PMIX_NAMESPACE: - *sc = OPAL_PMIX_NAMESPACE; - break; - case PMIX_SESSION: - *sc = OPAL_PMIX_SESSION; - break; - default: - *sc = OPAL_PMIX_DATA_RANGE_UNDEF; - rc = OPAL_ERR_BAD_PARAM; - break; - } - return rc; -} - -static int convert_persistence(opal_pmix_persistence_t *p, - pmix_persistence_t persist) -{ - int rc = OPAL_SUCCESS; - - switch (persist) { - case PMIX_PERSIST_INDEF: - *p = OPAL_PMIX_PERSIST_INDEF; - break; - case PMIX_PERSIST_PROC: - *p = OPAL_PMIX_PERSIST_PROC; - break; - case PMIX_PERSIST_APP: - *p = OPAL_PMIX_PERSIST_APP; - break; - case PMIX_PERSIST_SESSION: - *p = OPAL_PMIX_PERSIST_SESSION; - break; - default: - *p = OPAL_PMIX_PERSIST_PROC; - rc = OPAL_ERR_BAD_PARAM; - } - return rc; -} - - diff --git a/opal/mca/pmix/pmix1xx/pmix1_server_south.c b/opal/mca/pmix/pmix1xx/pmix1_server_south.c index 24d10d613e..ebaf00500a 100644 --- a/opal/mca/pmix/pmix1xx/pmix1_server_south.c +++ b/opal/mca/pmix/pmix1xx/pmix1_server_south.c @@ -48,6 +48,45 @@ extern pmix_server_module_t mymodule; extern opal_pmix_server_module_t *host_module; static char *dbgvalue=NULL; +/* Error handler registered with the PMIx library in pmix1_server_init: + * converts the PMIx status, proc array and info array into their OPAL + * equivalents and hands them to opal_pmix_base_errhandler. */ +static void myerr(pmix_status_t status, + pmix_proc_t procs[], size_t nprocs, + pmix_info_t info[], size_t ninfo) +{ + int rc; + opal_list_t plist, ilist; + opal_namelist_t *nm; + opal_value_t *iptr; +
size_t n; + + /* convert the incoming status */ + rc = pmix1_convert_rc(status); + + /* convert the array of procs */ + OBJ_CONSTRUCT(&plist, opal_list_t); + for (n=0; n < nprocs; n++) { + nm = OBJ_NEW(opal_namelist_t); + nm->name.jobid = strtoul(procs[n].nspace, NULL, 10); + nm->name.vpid = procs[n].rank; + opal_list_append(&plist, &nm->super); + } + + /* convert the array of info */ + OBJ_CONSTRUCT(&ilist, opal_list_t); + for (n=0; n < ninfo; n++) { + iptr = OBJ_NEW(opal_value_t); + iptr->key = strdup(info[n].key); + pmix1_value_unload(iptr, &info[n].value); + opal_list_append(&ilist, &iptr->super); + } + + /* call the base errhandler */ + opal_pmix_base_errhandler(rc, &plist, &ilist); + OPAL_LIST_DESTRUCT(&plist); + OPAL_LIST_DESTRUCT(&ilist); +} int pmix1_server_init(opal_pmix_server_module_t *module) { @@ -65,6 +104,8 @@ int pmix1_server_init(opal_pmix_server_module_t *module) /* record the host module */ host_module = module; + /* register the errhandler */ + PMIx_Register_errhandler(NULL, 0, myerr); return OPAL_SUCCESS; } @@ -72,6 +113,9 @@ int pmix1_server_finalize(void) { pmix_status_t rc; + /* deregister the errhandler */ + PMIx_Deregister_errhandler(); + rc = PMIx_server_finalize(); return pmix1_convert_rc(rc); } diff --git a/opal/mca/pmix/pmix1xx/pmix_pmix1.c b/opal/mca/pmix/pmix1xx/pmix_pmix1.c index d6d0f4e793..04ff6277ef 100644 --- a/opal/mca/pmix/pmix1xx/pmix_pmix1.c +++ b/opal/mca/pmix/pmix1xx/pmix_pmix1.c @@ -35,6 +35,7 @@ #include "opal/util/show_help.h" #include "pmix1.h" +#include "opal/mca/pmix/base/base.h" #include "opal/mca/pmix/pmix1xx/pmix/include/pmix/pmix_common.h" @@ -82,37 +83,11 @@ const opal_pmix_base_module_t opal_pmix_pmix1xx_module = { pmix1_server_notify_error, /* utility APIs */ PMIx_Get_version, - pmix1_register_errhandler, - pmix1_deregister_errhandler, + opal_pmix_base_register_handler, + opal_pmix_base_deregister_handler, pmix1_store_local }; -static pmix_notification_fn_t errhandler = NULL; - -static void notification_fn(int 
status, - opal_list_t *procs, - opal_list_t *info) -{ - /* convert the status */ - - /* convert the list of procs to an array of pmix_proc_t */ - - /* convert the list of info to an array of pmix_info_t */ - - /* pass this down to the notification function - * we were given */ -} - -void pmix1_register_errhandler(opal_pmix_errhandler_fn_t errhandler) -{ - return; -} - -void pmix1_deregister_errhandler(void) -{ - return; -} - int pmix1_store_local(const opal_process_name_t *proc, opal_value_t *val) { diff --git a/opal/mca/pmix/pmix_server.h b/opal/mca/pmix/pmix_server.h index 1acf1753b3..3f34d0a617 100644 --- a/opal/mca/pmix/pmix_server.h +++ b/opal/mca/pmix/pmix_server.h @@ -92,8 +92,6 @@ typedef int (*opal_pmix_server_dmodex_req_fn_t)(opal_process_name_t *proc, opal_ * process is also provided and is expected to be returned on any subsequent * lookup request */ typedef int (*opal_pmix_server_publish_fn_t)(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_pmix_persistence_t persist, opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); @@ -110,18 +108,16 @@ typedef int (*opal_pmix_server_publish_fn_t)(opal_process_name_t *proc, * how the operation is to be executed (e.g., timeout limits, whether the * lookup should wait until data appears). */ -typedef int (*opal_pmix_server_lookup_fn_t)(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_list_t *info, char **keys, +typedef int (*opal_pmix_server_lookup_fn_t)(opal_process_name_t *proc, char **keys, + opal_list_t *info, opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata); /* Delete data from the data store. The host server will be passed a NULL-terminated array * of string keys along with the scope within which the data is expected to have * been published. 
The callback is to be executed upon completion of the delete * procedure */ -typedef int (*opal_pmix_server_unpublish_fn_t)(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_list_t *info, char **keys, +typedef int (*opal_pmix_server_unpublish_fn_t)(opal_process_name_t *proc, char **keys, + opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); /* Spawn a set of applications/processes as per the PMIx API. Note that diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index d6fa76b6dd..e872b48054 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -32,14 +32,17 @@ BEGIN_C_DECLS #define OPAL_PMIX_USERID "pmix.euid" // (uint32_t) effective user id #define OPAL_PMIX_GRPID "pmix.egid" // (uint32_t) effective group id +/* general proc-level attributes */ #define OPAL_PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch #define OPAL_PMIX_CREDENTIAL "pmix.cred" // (char*) security credential assigned to proc #define OPAL_PMIX_SPAWNED "pmix.spawned" // (bool) true if this proc resulted from a call to PMIx_Spawn #define OPAL_PMIX_ARCH "pmix.arch" // (uint32_t) datatype architecture flag + /* scratch directory locations for use by applications */ #define OPAL_PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session #define OPAL_PMIX_NSDIR "pmix.nsdir" // (char*) sub-tmpdir assigned to namespace #define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc + /* information about relative ranks as assigned by the RM */ #define OPAL_PMIX_JOBID "pmix.jobid" // (char*) jobid assigned by scheduler #define OPAL_PMIX_APPNUM "pmix.appnum" // (uint32_t) app number within the job @@ -71,17 +74,20 @@ BEGIN_C_DECLS #define OPAL_PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within the specified nspace #define OPAL_PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace #define 
OPAL_PMIX_PROC_URI "pmix.puri" // (char*) URI containing contact info for proc + /* size info */ #define OPAL_PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this nspace #define OPAL_PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job #define OPAL_PMIX_LOCAL_SIZE "pmix.local.size" // (uint32_t) #procs in this job on this node #define OPAL_PMIX_NODE_SIZE "pmix.node.size" // (uint32_t) #procs across all jobs on this node #define OPAL_PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job + /* topology info */ #define OPAL_PMIX_NET_TOPO "pmix.ntopo" // (char*) xml-representation of network topology #define OPAL_PMIX_LOCAL_TOPO "pmix.ltopo" // (char*) xml-representation of local node topology #define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for this job #define OPAL_PMIX_TOPOLOGY "pmix.topo" // (hwloc_topology_t) pointer to the PMIx client's internal topology object + /* fault tolerance-related info */ #define OPAL_PMIX_TERMINATE_SESSION "pmix.term.sess" // (bool) RM intends to terminate session #define OPAL_PMIX_TERMINATE_JOB "pmix.term.job" // (bool) RM intends to terminate this job @@ -95,6 +101,9 @@ BEGIN_C_DECLS #define OPAL_PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until the specified #values are found #define OPAL_PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective #define OPAL_PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory +#define OPAL_PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job +#define OPAL_PMIX_RANGE "pmix.range" // (int) opal_pmix_data_range_t value for calls to publish/lookup/unpublish +#define OPAL_PMIX_PERSISTENCE "pmix.persist" // (int) opal_pmix_persistence_t value for calls to publish /* attribute used by host server to pass data to the server convenience library - the * 
data will then be parsed and provided to the local clients */ @@ -126,7 +135,8 @@ BEGIN_C_DECLS #define OPAL_PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin -/* define a scope for data "put" by PMI per the following: +/* define a scope for data "put" by PMI per the following - maintain + * consistent order with the PMIx distro : * * OPAL_PMI_LOCAL - the data is intended only for other application * processes on the same node. Data marked in this way @@ -137,7 +147,7 @@ BEGIN_C_DECLS * OPAL_PMI_GLOBAL - the data is to be shared with all other requesting processes, * regardless of location */ -#define OPAL_PMIX_SCOPE PMIX_UINT32 +#define OPAL_PMIX_SCOPE PMIX_UINT typedef enum { OPAL_PMIX_SCOPE_UNDEF = 0, OPAL_PMIX_LOCAL, // share to procs also on this node @@ -145,15 +155,17 @@ typedef enum { OPAL_PMIX_GLOBAL } opal_pmix_scope_t; -/* define a range for data "published" by PMI */ -#define OPAL_PMIX_DATA_RANGE OPAL_UINT8 +/* define a range for data "published" by PMI - maintain + * consistent order with the PMIx distro */ +#define OPAL_PMIX_DATA_RANGE OPAL_UINT typedef enum { OPAL_PMIX_DATA_RANGE_UNDEF = 0, OPAL_PMIX_NAMESPACE, // data is available to procs in the same nspace only OPAL_PMIX_SESSION // data available to all jobs in this session } opal_pmix_data_range_t; -/* define a "persistence" policy for data published by clients */ +/* define a "persistence" policy for data published by clients - maintain + * consistent order with the PMIx distro */ typedef enum { OPAL_PMIX_PERSIST_INDEF = 0, // retain until specifically deleted OPAL_PMIX_PERSIST_PROC, // retain until publishing process terminates diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index dd21aa4fc4..3a10d51c3f 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -351,6 +351,14 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_routed_base_select"; goto error; } + /* 
setup the routed info - the selected routed component + * will know what to do. + */ + if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { + ORTE_ERROR_LOG(ret); + error = "orte_routed.init_routes"; + goto error; + } /* * Group communications */ @@ -645,7 +653,7 @@ int orte_ess_base_orted_finalize(void) /* shutdown the pmix server */ pmix_server_finalize(); (void) mca_base_framework_close(&opal_pmix_base_framework); - + /* close frameworks */ (void) mca_base_framework_close(&orte_schizo_base_framework); (void) mca_base_framework_close(&orte_filem_base_framework); diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 0049d74f18..1b6f586bf9 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -647,12 +647,6 @@ static int rte_init(void) error = "opal_pmix_base_select"; goto error; } - /* setup the PMIx server */ - if (ORTE_SUCCESS != (ret = pmix_server_init())) { - ORTE_ERROR_LOG(ret); - error = "pmix server init"; - goto error; - } /* setup the routed info - the selected routed component * will know what to do. 
@@ -662,6 +656,14 @@ static int rte_init(void) error = "orte_routed.init_routes"; goto error; } + + /* setup the PMIx server */ + if (ORTE_SUCCESS != (ret = pmix_server_init())) { + ORTE_ERROR_LOG(ret); + error = "pmix server init"; + goto error; + } + /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); diff --git a/orte/mca/schizo/base/base.h b/orte/mca/schizo/base/base.h index 894fe35f35..2f22ebb875 100644 --- a/orte/mca/schizo/base/base.h +++ b/orte/mca/schizo/base/base.h @@ -65,7 +65,6 @@ ORTE_DECLSPEC int orte_schizo_base_parse_cli(char *personality, ORTE_DECLSPEC int orte_schizo_base_parse_env(char *personality, char *path, opal_cmd_line_t *cmd_line, - char *server, char **srcenv, char ***dstenv); ORTE_DECLSPEC int orte_schizo_base_setup_fork(orte_job_t *jdata, diff --git a/orte/mca/schizo/base/schizo_base_stubs.c b/orte/mca/schizo/base/schizo_base_stubs.c index 663791f4b6..a2e5fe1bf2 100644 --- a/orte/mca/schizo/base/schizo_base_stubs.c +++ b/orte/mca/schizo/base/schizo_base_stubs.c @@ -40,7 +40,6 @@ int orte_schizo_base_parse_cli(char *personality, int orte_schizo_base_parse_env(char *personality, char *path, opal_cmd_line_t *cmd_line, - char *server, char **srcenv, char ***dstenv) { @@ -50,7 +49,7 @@ int orte_schizo_base_parse_env(char *personality, OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (0 == strcmp(personality, mod->component->mca_component_name)) { if (NULL != mod->module->parse_env) { - rc = mod->module->parse_env(personality, path, cmd_line, server, srcenv, dstenv); + rc = mod->module->parse_env(personality, path, cmd_line, srcenv, dstenv); return rc; } } diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index 2be71ba6d9..ffe5ce85be 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -54,7 +54,6 @@ 
static int parse_cli(char *personality, static int parse_env(char *personality, char *path, opal_cmd_line_t *cmd_line, - char *server, char **srcenv, char ***dstenv); static int setup_fork(orte_job_t *jdata, @@ -154,7 +153,6 @@ static int parse_cli(char *personality, static int parse_env(char *personality, char *path, opal_cmd_line_t *cmd_line, - char *ompi_server, char **srcenv, char ***dstenv) { @@ -181,11 +179,6 @@ static int parse_env(char *personality, } } - /* add the ompi-server, if provided */ - if (NULL != ompi_server) { - opal_setenv("OMPI_MCA_pubsub_orte_server", ompi_server, true, dstenv); - } - /* set necessary env variables for external usage from tune conf file*/ int set_from_file = 0; vars = NULL; diff --git a/orte/mca/schizo/schizo.h b/orte/mca/schizo/schizo.h index 528ef7b4e5..5b685e503d 100644 --- a/orte/mca/schizo/schizo.h +++ b/orte/mca/schizo/schizo.h @@ -46,7 +46,6 @@ typedef int (*orte_schizo_base_module_parse_cli_fn_t)(char *personality, typedef int (*orte_schizo_base_module_parse_env_fn_t)(char *personality, char *path, opal_cmd_line_t *cmd_line, - char *server, char **srcenv, char ***dstenv); diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index 845ffe3f45..4649251b29 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -62,6 +62,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/rml/base/rml_contact.h" #include "orte/util/name_fns.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" @@ -125,21 +126,40 @@ void pmix_server_register_params(void) MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &orte_pmix_server_globals.timeout); - orte_pmix_server_globals.timeout = orte_pmix_server_globals.timeout * 1000000; /* register the URI of the UNIVERSAL data server */ + orte_pmix_server_globals.server_uri = NULL; + (void) mca_base_var_register ("orte", "pmix", NULL, 
"server_uri", + "URI of a session-level keyval server for publish/lookup operations", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_pmix_server_globals.server_uri); - /* if the universal server wasn't specified, then we use - * our own HNP for that purpose */ - orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; - + /* whether or not to wait for the universal server */ + orte_pmix_server_globals.wait_for_server = false; + (void) mca_base_var_register ("orte", "pmix", NULL, "wait_for_server", + "Whether or not to wait for the session-level server to start", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_pmix_server_globals.wait_for_server); } static void eviction_cbfunc(struct opal_hotel_t *hotel, int room_num, void *occupant) { pmix_server_req_t *req = (pmix_server_req_t*)occupant; + int rc; + /* decrement the request timeout */ + req->timeout -= orte_pmix_server_globals.timeout; + if (0 < req->timeout) { + /* not done yet - check us back in */ + if (OPAL_SUCCESS == (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { + return; + } + ORTE_ERROR_LOG(rc); + /* fall thru and return an error so the caller doesn't hang */ + } /* don't let the caller hang */ if (NULL != req->opcbfunc) { req->opcbfunc(OPAL_ERR_TIMEOUT, req->cbdata); @@ -169,7 +189,7 @@ int pmix_server_init(void) OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t); if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs, orte_pmix_server_globals.num_rooms, - orte_event_base, orte_pmix_server_globals.timeout, + orte_event_base, orte_pmix_server_globals.timeout*1000000, ORTE_ERROR_PRI, eviction_cbfunc))) { ORTE_ERROR_LOG(rc); return rc; @@ -191,12 +211,103 @@ int pmix_server_init(void) orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_CLIENT, ORTE_RML_PERSISTENT, pmix_server_keyval_client, NULL); + /* ensure the PMIx server uses the proper rendezvous 
directory */ + opal_setenv("PMIX_SERVER_TMPDIR", orte_process_info.proc_session_dir, true, &environ); + /* setup the local server */ if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server))) { ORTE_ERROR_LOG(rc); /* memory cleanup will occur when finalize is called */ } + /* if the universal server wasn't specified, then we use + * our own HNP for that purpose */ + if (NULL == orte_pmix_server_globals.server_uri) { + orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; + } else { + char *server; + opal_buffer_t buf; + if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || + 0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { + char input[1024], *filename; + FILE *fp; + + /* it is a file - get the filename */ + filename = strchr(orte_pmix_server_globals.server_uri, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, + orte_basename, orte_pmix_server_globals.server_uri); + return ORTE_ERR_BAD_PARAM; + } + ++filename; /* space past the : */ + + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, + orte_basename, orte_pmix_server_globals.server_uri); + return ORTE_ERR_BAD_PARAM; + } + + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! 
*/ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, + orte_basename, orte_pmix_server_globals.server_uri); + return ORTE_ERR_BAD_PARAM; + } + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ + fclose(fp); + orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, + orte_basename, orte_pmix_server_globals.server_uri, + orte_basename); + return ORTE_ERR_BAD_PARAM; + } + fclose(fp); + input[strlen(input)-1] = '\0'; /* remove newline */ + server = strdup(input); + } else { + server = strdup(orte_pmix_server_globals.server_uri); + } + /* setup our route to the server */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + opal_dss.pack(&buf, &server, 1, OPAL_STRING); + if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { + ORTE_ERROR_LOG(rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return rc; + } + OBJ_DESTRUCT(&buf); + /* parse the URI to get the server's name */ + if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* check if we are to wait for the server to start - resolves + * a race condition that can occur when the server is run + * as a background job - e.g., in scripts + */ + if (orte_pmix_server_globals.wait_for_server) { + /* ping the server */ + struct timeval timeout; + timeout.tv_sec = orte_pmix_server_globals.timeout; + timeout.tv_usec = 0; + if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) { + /* try it one more time */ + if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) { + /* okay give up */ + orte_show_help("help-orterun.txt", "orterun:server-not-found", true, + orte_basename, server, + (long)orte_pmix_server_globals.timeout, + ORTE_ERROR_NAME(rc)); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return rc; + } + } + } + } + return rc; } @@ -461,6 +572,7 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender, 
static void rqcon(pmix_server_req_t *p) { + p->timeout = orte_pmix_server_globals.timeout; p->jdata = NULL; OBJ_CONSTRUCT(&p->msg, opal_buffer_t); p->opcbfunc = NULL; diff --git a/orte/orted/pmix/pmix_server_internal.h b/orte/orted/pmix/pmix_server_internal.h index 6d327453cb..576c610fc9 100644 --- a/orte/orted/pmix/pmix_server_internal.h +++ b/orte/orted/pmix/pmix_server_internal.h @@ -53,6 +53,7 @@ typedef struct { opal_object_t super; opal_event_t ev; + int timeout; int room_num; int remote_room_num; orte_process_name_t proxy; @@ -146,17 +147,13 @@ extern int pmix_server_fencenb_fn(opal_list_t *procs, opal_list_t *info, extern int pmix_server_dmodex_req_fn(opal_process_name_t *proc, opal_list_t *info, opal_pmix_modex_cbfunc_t cbfunc, void *cbdata); extern int pmix_server_publish_fn(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_pmix_persistence_t persist, opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); -extern int pmix_server_lookup_fn(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_list_t *info, char **keys, +extern int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, + opal_list_t *info, opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata); -extern int pmix_server_unpublish_fn(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_list_t *info, char **keys, +extern int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys, + opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); extern int pmix_server_spawn_fn(opal_process_name_t *requestor, opal_list_t *job_info, opal_list_t *apps, @@ -186,6 +183,8 @@ typedef struct { opal_hotel_t reqs; int num_rooms; int timeout; + char *server_uri; + bool wait_for_server; orte_process_name_t server; } pmix_server_globals_t; diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index d98fd508e4..306ac56040 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -85,16 
+85,16 @@ static void execute(int sd, short args, void *cbdata) } int pmix_server_publish_fn(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_pmix_persistence_t persist, opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_server_req_t *req; int rc; uint8_t cmd = ORTE_PMIX_PUBLISH_CMD; - int32_t ninfo; opal_value_t *iptr; + opal_pmix_data_range_t range = OPAL_PMIX_SESSION; + opal_pmix_persistence_t persist = OPAL_PMIX_PERSIST_APP; + bool rset, pset; /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); @@ -115,6 +115,25 @@ int pmix_server_publish_fn(opal_process_name_t *proc, return rc; } + /* no help for it - need to search for range/persistence */ + rset = false; + pset = false; + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { + range = iptr->data.integer; + if (pset) { + break; + } + rset = true; + } else if (0 == strcmp(iptr->key, OPAL_PMIX_PERSISTENCE)) { + persist = iptr->data.integer; + if (rset) { + break; + } + pset = true; + } + } + /* pack the range */ if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &range, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); @@ -136,16 +155,13 @@ int pmix_server_publish_fn(opal_process_name_t *proc, return rc; } - /* pack the number of info items */ - ninfo = opal_list_get_size(info); - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &ninfo, 1, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(req); - return rc; - } - - /* if we have items, pack those too */ + /* if we have items, pack those too - ignore persistence + * and range values */ OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE) || + 0 == strcmp(iptr->key, OPAL_PMIX_PERSISTENCE)) { + continue; + } if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); @@ -163,17 +179,16 @@ int pmix_server_publish_fn(opal_process_name_t *proc, } -int pmix_server_lookup_fn(opal_process_name_t *proc, - 
opal_pmix_data_range_t range, - opal_list_t *info, char **keys, +int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, + opal_list_t *info, opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata) { pmix_server_req_t *req; int rc; uint8_t cmd = ORTE_PMIX_LOOKUP_CMD; int32_t nkeys, i; - int32_t ninfo; opal_value_t *iptr; + opal_pmix_data_range_t range = OPAL_PMIX_SESSION; /* the list of info objects are directives for us - they include * things like timeout constraints, so there is no reason to @@ -191,6 +206,14 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, return rc; } + /* no help for it - need to search for range */ + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { + range = iptr->data.integer; + break; + } + } + /* pack the range */ if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &range, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); @@ -205,23 +228,6 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, req->target = *ORTE_PROC_MY_HNP; } - /* pack the number of info items */ - ninfo = opal_list_get_size(info); - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &ninfo, 1, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(req); - return rc; - } - - /* if we have items, pack those too */ - OPAL_LIST_FOREACH(iptr, info, opal_value_t) { - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(req); - return rc; - } - } - /* pack the number of keys */ nkeys = opal_argv_count(keys); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &nkeys, 1, OPAL_UINT32))) { @@ -239,6 +245,18 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, } } + /* if we have items, pack those too - ignore range value */ + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { + continue; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(req); + return rc; + } + } + /* 
thread-shift so we can store the tracker */ opal_event_set(orte_event_base, &(req->ev), -1, OPAL_EV_WRITE, execute, req); @@ -248,16 +266,16 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, return OPAL_SUCCESS; } -int pmix_server_unpublish_fn(opal_process_name_t *proc, - opal_pmix_data_range_t range, - opal_list_t *info, char **keys, +int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys, + opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata) { pmix_server_req_t *req; int rc; uint8_t cmd = ORTE_PMIX_UNPUBLISH_CMD; - uint32_t nkeys, ninfo; + uint32_t nkeys, n; opal_value_t *iptr; + opal_pmix_data_range_t range = OPAL_PMIX_SESSION; /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); @@ -278,6 +296,14 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, return rc; } + /* no help for it - need to search for range */ + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { + range = iptr->data.integer; + break; + } + } + /* pack the range */ if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &range, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); @@ -292,22 +318,6 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, req->target = *ORTE_PROC_MY_HNP; } - /* pack the number of info items */ - ninfo = opal_list_get_size(info); - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &ninfo, 1, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(req); - return rc; - } - - /* if we have items, pack those too */ - OPAL_LIST_FOREACH(iptr, info, opal_value_t) { - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(req); - return rc; - } - } /* pack the number of keys */ nkeys = opal_argv_count(keys); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &nkeys, 1, OPAL_UINT32))) { @@ -317,10 +327,24 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, } /* pack the keys too */ - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, keys, nkeys, 
OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(req); - return rc; + for (n=0; n < nkeys; n++) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &keys[n], 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(req); + return rc; + } + } + + /* if we have items, pack those too - ignore range value */ + OPAL_LIST_FOREACH(iptr, info, opal_value_t) { + if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { + continue; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(req); + return rc; + } } /* thread-shift so we can store the tracker */ diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index e944c7a83b..ef306c2d16 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -80,8 +80,29 @@ OBJ_CLASS_INSTANCE(orte_data_object_t, opal_object_t, construct, destruct); +/* define a request object for delayed answers */ +typedef struct { + opal_list_item_t super; /* list linkage: the class is declared below with parent opal_list_item_t */ + orte_process_name_t requestor; + uint32_t uid; + opal_pmix_data_range_t range; + char **keys; /* argv-style key array; released with opal_argv_free in rqdes */ +} orte_data_req_t; +static void rqcon(orte_data_req_t *p) +{ + p->keys = NULL; /* no key array attached yet */ +} +static void rqdes(orte_data_req_t *p) +{ + opal_argv_free(p->keys); /* the request owns its key array */ +} +OBJ_CLASS_INSTANCE(orte_data_req_t, + opal_list_item_t, + rqcon, rqdes); + /* local globals */ static opal_pointer_array_t orte_data_server_store; +static opal_list_t pending; /* constructed in orte_data_server_init, destructed in orte_data_server_finalize */ int orte_data_server_init(void) { @@ -96,6 +117,8 @@ int orte_data_server_init(void) return rc; } + OBJ_CONSTRUCT(&pending, opal_list_t); + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_SERVER, ORTE_RML_PERSISTENT, @@ -118,6 +141,7 @@ void orte_data_server_finalize(void) } } OBJ_DESTRUCT(&orte_data_server_store); + OPAL_LIST_DESTRUCT(&pending); } void orte_data_server(int status, orte_process_name_t* sender, @@ -128,15 +152,16 @@ void orte_data_server(int status, orte_process_name_t* sender, orte_std_cntr_t count; opal_process_name_t requestor; 
orte_data_object_t *data; - opal_buffer_t *answer; + opal_buffer_t *answer, *reply; int rc, ret, k; opal_value_t *iptr, *inext; uint32_t ninfo, i; char **keys = NULL, *str; - bool ret_packed = false; + bool ret_packed = false, wait = false; int room_number; uint32_t uid; opal_pmix_data_range_t range; + orte_data_req_t *req, *rqnext; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s data server got message from %s", @@ -196,34 +221,66 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - /* unpack the number of info elements */ count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ninfo, &count, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(data); - goto SEND_ERROR; - } - - if (0 < ninfo) { - for (i=0; i < ninfo; i++) { - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &iptr, &count, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(data); - goto SEND_ERROR; - } - /* if this is the userid, separate it out */ - if (0 == strcmp(iptr->key, OPAL_PMIX_USERID)) { - data->uid = iptr->data.uint32; - OBJ_RELEASE(iptr); - } else { - opal_list_append(&data->values, &iptr->super); - } + while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &iptr, &count, OPAL_VALUE))) { + /* if this is the userid, separate it out */ + if (0 == strcmp(iptr->key, OPAL_PMIX_USERID)) { + data->uid = iptr->data.uint32; + OBJ_RELEASE(iptr); + } else { + opal_list_append(&data->values, &iptr->super); } } data->index = opal_pointer_array_add(&orte_data_server_store, data); + /* check for pending requests that match this data */ + reply = NULL; + OPAL_LIST_FOREACH_SAFE(req, rqnext, &pending, orte_data_req_t) { + if (req->uid != data->uid) { + continue; + } + if (req->range != data->range) { + continue; + } + for (i=0; NULL != req->keys[i]; i++) { + /* cycle thru the data keys for matches */ + OPAL_LIST_FOREACH(iptr, &data->values, opal_value_t) { + if (0 == strcmp(iptr->key, req->keys[i])) { + /* found it - package it for return */ + if 
(NULL == reply) { + reply = OBJ_NEW(opal_buffer_t); + ret = ORTE_SUCCESS; + if (ORTE_SUCCESS != (rc = opal_dss.pack(reply, &ret, 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + break; + } + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(reply, &data->owner, 1, OPAL_NAME))) { + ORTE_ERROR_LOG(rc); + break; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(reply, &iptr, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + break; + } + } + } + } + if (NULL != reply) { + /* send it back to the requestor */ + if (0 > (rc = orte_rml.send_buffer_nb(&req->requestor, reply, ORTE_RML_TAG_DATA_CLIENT, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(reply); + } + /* remove this request */ + opal_list_remove_item(&pending, &req->super); + OBJ_RELEASE(req); + reply = NULL; + } + } + /* tell the user it was wonderful... */ ret = ORTE_SUCCESS; if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) { @@ -247,28 +304,6 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - /* unpack the number of info elements */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ninfo, &count, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - goto SEND_ERROR; - } - if (0 < ninfo) { - for (i=0; i < ninfo; i++) { - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &iptr, &count, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - goto SEND_ERROR; - } - /* if this is the userid, separate it out */ - if (0 == strcmp(iptr->key, OPAL_PMIX_USERID)) { - uid = iptr->data.uint32; - } - /* ignore anything else for now */ - OBJ_RELEASE(iptr); - } - } - /* unpack the number of keys */ count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ninfo, &count, OPAL_UINT32))) { @@ -277,6 +312,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } if (0 == ninfo) { /* they forgot to send us the keys?? 
*/ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); rc = ORTE_ERR_BAD_PARAM; goto SEND_ERROR; } @@ -293,7 +329,27 @@ void orte_data_server(int status, orte_process_name_t* sender, free(str); } + /* unpack any info elements */ + count = 1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &iptr, &count, OPAL_VALUE))) { + /* if this is the userid, separate it out */ + if (0 == strcmp(iptr->key, OPAL_PMIX_USERID)) { + uid = iptr->data.uint32; + } else if (0 == strcmp(iptr->key, OPAL_PMIX_WAIT)) { + /* flag that we wait until the data is present */ + wait = true; + } + /* ignore anything else for now */ + OBJ_RELEASE(iptr); + } + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + opal_argv_free(keys); + goto SEND_ERROR; + } + /* cycle across the provided keys */ + ret_packed = false; for (i=0; NULL != keys[i]; i++) { /* cycle across the stored data, looking for a match */ for (k=0; k < orte_data_server_store.size; k++) { @@ -336,12 +392,23 @@ void orte_data_server(int status, orte_process_name_t* sender, } } } - opal_argv_free(keys); if (!ret_packed) { + /* if we were told to wait for the data, then queue this up + * for later processing */ + if (wait) { + req = OBJ_NEW(orte_data_req_t); + req->requestor = *sender; + req->uid = uid; + req->range = range; + req->keys = keys; + return; + } /* nothing was found - indicate that situation */ rc = ORTE_ERR_NOT_FOUND; + opal_argv_free(keys); goto SEND_ERROR; } + opal_argv_free(keys); goto SEND_ANSWER; break; @@ -365,28 +432,6 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - /* unpack the number of info elements */ - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ninfo, &count, OPAL_UINT32))) { - ORTE_ERROR_LOG(rc); - goto SEND_ERROR; - } - if (0 < ninfo) { - for (i=0; i < ninfo; i++) { - count = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &iptr, &count, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - goto SEND_ERROR; - } - /* if this is the userid, 
separate it out */ - if (0 == strcmp(iptr->key, OPAL_PMIX_USERID)) { - uid = iptr->data.uint32; - } - /* ignore anything else for now */ - OBJ_RELEASE(iptr); - } - } - /* unpack the number of keys */ count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ninfo, &count, OPAL_UINT32))) { @@ -411,6 +456,22 @@ void orte_data_server(int status, orte_process_name_t* sender, free(str); } + /* unpack any info elements */ + count = 1; + while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &iptr, &count, OPAL_VALUE))) { + /* if this is the userid, separate it out */ + if (0 == strcmp(iptr->key, OPAL_PMIX_USERID)) { + uid = iptr->data.uint32; + } + /* ignore anything else for now */ + OBJ_RELEASE(iptr); + } + if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + ORTE_ERROR_LOG(rc); + opal_argv_free(keys); + goto SEND_ERROR; + } + /* cycle across the provided keys */ for (i=0; NULL != keys[i]; i++) { /* cycle across the stored data, looking for a match */ @@ -463,6 +524,10 @@ void orte_data_server(int status, orte_process_name_t* sender, } SEND_ERROR: + OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + "%s data server: sending error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_ERROR_NAME(rc))); /* pack the error code */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); diff --git a/orte/tools/orte-submit/orte-submit.c b/orte/tools/orte-submit/orte-submit.c index 6e45624a9d..8c9cf0875f 100644 --- a/orte/tools/orte-submit/orte-submit.c +++ b/orte/tools/orte-submit/orte-submit.c @@ -1046,7 +1046,7 @@ static int create_app(int argc, char* argv[], app->env = opal_argv_copy(*app_env); if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(myglobals.personality, myglobals.path, - &cmd_line, NULL, + &cmd_line, environ, &app->env))) { goto cleanup; } diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 342a1715ce..aa6b35085e 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -158,7 +158,6 
@@ void* MPIR_Breakpoint(void) static char **global_mca_env = NULL; static orte_std_cntr_t total_num_apps = 0; static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; -static char *ompi_server=NULL; /* * Globals @@ -284,16 +283,10 @@ static opal_cmd_line_init_t cmd_line_init[] = { NULL, OPAL_CMD_LINE_TYPE_BOOL, "Do not attempt to resolve interfaces" }, - /* uri of Open MPI server, or at least where to get it */ - { NULL, '\0', "ompi-server", "ompi-server", 1, - &orterun_globals.ompi_server, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the Open MPI server, or the name of the file (specified as file:filename) that contains that info" }, - { NULL, '\0', "wait-for-server", "wait-for-server", 0, - &orterun_globals.wait_for_server, OPAL_CMD_LINE_TYPE_BOOL, - "If ompi-server is not already running, wait until it is detected (default: false)" }, - { NULL, '\0', "server-wait-time", "server-wait-time", 1, - &orterun_globals.server_wait_timeout, OPAL_CMD_LINE_TYPE_INT, - "Time in seconds to wait for ompi-server (default: 10 sec)" }, + /* uri of PMIx publish/lookup server, or at least where to get it */ + { "pmix_server_uri", '\0', "ompi-server", "ompi-server", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info" }, { "carto_file_path", '\0', "cf", "cartofile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, @@ -1041,42 +1034,6 @@ int orterun(int argc, char *argv[]) goto DONE; } - /* if an uri for the ompi-server was provided, set the route */ - if (NULL != ompi_server) { - opal_buffer_t buf; - /* setup our route to the server */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &ompi_server, 1, OPAL_STRING); - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { - ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto DONE; - } - OBJ_DESTRUCT(&buf); - /* check if we are to wait for the server 
to start - resolves - * a race condition that can occur when the server is run - * as a background job - e.g., in scripts - */ - if (orterun_globals.wait_for_server) { - /* ping the server */ - struct timeval timeout; - timeout.tv_sec = orterun_globals.server_wait_timeout; - timeout.tv_usec = 0; - if (ORTE_SUCCESS != (rc = orte_rml.ping(ompi_server, &timeout))) { - /* try it one more time */ - if (ORTE_SUCCESS != (rc = orte_rml.ping(ompi_server, &timeout))) { - /* okay give up */ - orte_show_help("help-orterun.txt", "orterun:server-not-found", true, - orte_basename, ompi_server, - (long)orterun_globals.server_wait_timeout, - ORTE_ERROR_NAME(rc)); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto DONE; - } - } - } - } - /* setup for debugging */ orte_debugger_init_before_spawn(jdata); orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS, @@ -1175,9 +1132,6 @@ static int init_globals(void) orterun_globals.appfile = NULL; orterun_globals.wdir = NULL; orterun_globals.path = NULL; - orterun_globals.ompi_server = NULL; - orterun_globals.wait_for_server = false; - orterun_globals.server_wait_timeout = 10; orterun_globals.stdin_target = "0"; orterun_globals.report_pid = NULL; orterun_globals.report_uri = NULL; @@ -1270,132 +1224,7 @@ static int parse_locals(orte_job_t *jdata, int argc, char* argv[]) bool made_app; orte_std_cntr_t j, size1; - /* if the ompi-server was given, then set it up here */ - if (NULL != orterun_globals.ompi_server) { - /* someone could have passed us a file instead of a uri, so - * we need to first check to see what we have - if it starts - * with "file", then we know it is a file. Otherwise, we assume - * it is a uri as provided by the ompi-server's output - * of an ORTE-standard string. Note that this is NOT a standard - * uri as it starts with the process name! 
- */ - if (0 == strncmp(orterun_globals.ompi_server, "file", strlen("file")) || - 0 == strncmp(orterun_globals.ompi_server, "FILE", strlen("FILE"))) { - char input[1024], *filename; - FILE *fp; - - /* it is a file - get the filename */ - filename = strchr(orterun_globals.ompi_server, ':'); - if (NULL == filename) { - /* filename is not correctly formatted */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, - orte_basename, orterun_globals.ompi_server); - exit(1); - } - ++filename; /* space past the : */ - - if (0 >= strlen(filename)) { - /* they forgot to give us the name! */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, - orte_basename, orterun_globals.ompi_server); - exit(1); - } - - /* open the file and extract the uri */ - fp = fopen(filename, "r"); - if (NULL == fp) { /* can't find or read file! */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, - orte_basename, orterun_globals.ompi_server); - exit(1); - } - if (NULL == fgets(input, 1024, fp)) { - /* something malformed about file */ - fclose(fp); - orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, - orte_basename, orterun_globals.ompi_server, - orte_basename); - exit(1); - } - fclose(fp); - input[strlen(input)-1] = '\0'; /* remove newline */ - ompi_server = strdup(input); - } else if (0 == strncmp(orterun_globals.ompi_server, "pid", strlen("pid")) || - 0 == strncmp(orterun_globals.ompi_server, "PID", strlen("PID"))) { - opal_list_t hnp_list; - opal_list_item_t *item; - orte_hnp_contact_t *hnp; - char *ptr; - pid_t pid; - - ptr = strchr(orterun_globals.ompi_server, ':'); - if (NULL == ptr) { - /* pid is not correctly formatted */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true, - orte_basename, orte_basename, - orterun_globals.ompi_server, orte_basename); - exit(1); - } - ++ptr; /* space past the : */ - - if (0 >= strlen(ptr)) { - /* they forgot to give us the 
pid! */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-bad", true, - orte_basename, orte_basename, - orterun_globals.ompi_server, orte_basename); - exit(1); - } - - pid = strtoul(ptr, NULL, 10); - - /* to search the local mpirun's, we have to partially initialize the - * orte_process_info structure. This won't fully be setup until orte_init, - * but we finagle a little bit of it here - */ - if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(NULL, &orte_process_info.tmpdir_base, - &orte_process_info.top_session_dir, - NULL, NULL, NULL))) { - orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true, - orte_basename, orte_basename); - exit(1); - } - - OBJ_CONSTRUCT(&hnp_list, opal_list_t); - - /* get the list of HNPs, but do -not- setup contact info to them in the RML */ - if (ORTE_SUCCESS != (rc = orte_list_local_hnps(&hnp_list, false))) { - orte_show_help("help-orterun.txt", "orterun:ompi-server-could-not-get-hnp-list", true, - orte_basename, orte_basename); - exit(1); - } - - /* search the list for the desired pid */ - while (NULL != (item = opal_list_remove_first(&hnp_list))) { - hnp = (orte_hnp_contact_t*)item; - if (pid == hnp->pid) { - ompi_server = strdup(hnp->rml_uri); - goto hnp_found; - } - OBJ_RELEASE(item); - } - /* if we got here, it wasn't found */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-pid-not-found", true, - orte_basename, orte_basename, pid, orterun_globals.ompi_server, - orte_basename); - OBJ_DESTRUCT(&hnp_list); - exit(1); - hnp_found: - /* cleanup rest of list */ - while (NULL != (item = opal_list_remove_first(&hnp_list))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&hnp_list); - } else { - ompi_server = strdup(orterun_globals.ompi_server); - } - } - /* Make the apps */ - temp_argc = 0; temp_argv = NULL; opal_argv_append(&temp_argc, &temp_argv, argv[0]); @@ -1640,7 +1469,7 @@ static int create_app(int argc, char* argv[], app->env = opal_argv_copy(*app_env); if (ORTE_SUCCESS != (rc = 
orte_schizo.parse_env(orterun_globals.personality, orterun_globals.path, - &cmd_line, ompi_server, + &cmd_line, environ, &app->env))) { goto cleanup; } diff --git a/orte/tools/orterun/orterun.h b/orte/tools/orterun/orterun.h index fe11e56839..c25bfcd96a 100644 --- a/orte/tools/orterun/orterun.h +++ b/orte/tools/orterun/orterun.h @@ -51,9 +51,6 @@ struct orterun_globals_t { char *path; char *preload_files; bool sleep; - char *ompi_server; - bool wait_for_server; - int server_wait_timeout; char *stdin_target; char *prefix; char *path_to_mpirun;