
For large-scale systems, we would like to avoid doing a full modex during MPI_Init so that launch scales a little better. At the moment our options are somewhat limited, as only a few BTLs avoid immediately calling modex_recv on all procs during startup. However, for those situations where someone can take advantage of it, this adds the ability to do a "modex on demand" retrieval of data from remote procs when we launch via mpirun.
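
Concretely, the on-demand path hangs off the normal db fetch: if the local database doesn't have the peer's data, we ask that peer for its modex package and then retry the fetch. A condensed sketch of the pattern, taken from the ompi_rte_db_fetch() change in the diff below (error logging trimmed):

    /* sketch only - condensed from the ompi_rte_db_fetch() change below */
    if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)(&proc->proc_name), key, data, type))) {
        /* not known locally - request the peer's modex package directly... */
        if (ORTE_SUCCESS != (rc = direct_modex((orte_process_name_t*)&proc->proc_name, OPAL_SCOPE_PEER))) {
            return rc;
        }
        /* ...and retry the local fetch now that the package is cached */
        rc = opal_db.fetch((opal_identifier_t*)(&proc->proc_name), key, data, type);
    }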

NOTE: launch performance will be absolutely awful if you do this with BTLs that aren't configured to modex_recv on first message!

Even with "modex on demand", we still have to do a barrier in place of the modex - we simply don't move any data around, which does reduce the time impact. The barrier is required to ensure that the other proc has in fact registered all its BTL info and therefore is prepared to hand over a complete data package. Otherwise, you may not get the info you need. In addition, the shared memory BTL can fail to properly rendezvous as it expects the barrier to be in place.
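
In code terms, the MPI_Init-time exchange becomes a conditional; this is a condensed view of the new ompi_rte_modex() shown in the diff below:

    /* sketch only - condensed from ompi_rte_modex() below */
    int ompi_rte_modex(ompi_rte_collective_t *coll)
    {
        if (orte_process_info.num_procs < ompi_hostname_cutoff ||
            !ompi_rte_orte_direct_modex ||
            orte_standalone_operation) {
            return orte_grpcomm.modex(coll);    /* normal full modex */
        }
        /* direct modex: move no data, just sync so peers have stored their info */
        return orte_grpcomm.barrier(coll);
    }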

This behavior will *only* take effect under the following conditions:

1. the job is launched via mpirun (i.e., not direct launched)

2. the number of procs exceeds the ompi_hostname_cutoff MCA param, which defaults to UINT32_MAX

3. the MCA param rte_orte_direct_modex is set to 1 (see the example invocation after this list). At the moment we are having problems getting this param to register properly, so only the first two conditions are in effect. Still, the bottom line is that you have to *want* this behavior to get it.
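
Assuming the registration problem gets fixed, turning this on would look something like the following (hypothetical command line - the proc count and cutoff value are just examples):

    mpirun -np 4096 --mca ompi_hostname_cutoff 1024 --mca rte_orte_direct_modex 1 ./my_app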

The planned next evolution of this will be to make the direct modex non-blocking - this will require two fixes:

1. if the remote proc doesn't have the required info, then let it delay its response until it does. This means we need a way for the MPI layer to tell the RTE "I am done entering modex data".

2. adjust the SM rendezvous logic to loop until the required file has been created (a rough sketch follows this list)
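
For fix 2, the change would presumably amount to a bounded retry around the check for the peer's shared memory backing file - roughly like the hypothetical sketch below (the function name and retry policy are invented for illustration; the actual SM BTL code is not part of this commit):

    /* hypothetical illustration of fix 2 - not part of this commit */
    #include <unistd.h>
    #include <sys/stat.h>

    static int wait_for_sm_backing_file(const char *path, int max_retries)
    {
        struct stat st;
        int i;

        for (i = 0; i < max_retries; i++) {
            if (0 == stat(path, &st)) {
                return 0;   /* file exists - safe to complete the rendezvous */
            }
            usleep(1000);   /* peer hasn't created it yet - back off and retry */
        }
        return -1;          /* give up so the caller can fail over or abort */
    }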

Creating a placeholder to bring this over to 1.7.5 when ready.

cmr=v1.7.5:reviewer=hjelmn:subject=Enable direct modex at scale

This commit was SVN r30259.
Ralph Castain 2014-01-11 17:36:06 +00:00
parent cc4440147f
commit 286ff6d552
7 changed files with 205 additions and 11 deletions

@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2013      Intel, Inc. All rights reserved
+ * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
  *
  * $COPYRIGHT$
  *
@@ -62,7 +62,7 @@ typedef orte_ns_cmp_bitmask_t ompi_rte_cmp_bitmask_t;
 /* Collective objects and operations */
 #define ompi_rte_collective_t orte_grpcomm_collective_t
 typedef orte_grpcomm_coll_id_t ompi_rte_collective_id_t;
-#define ompi_rte_modex(a) orte_grpcomm.modex(a)
+OMPI_DECLSPEC int ompi_rte_modex(ompi_rte_collective_t *coll);
 #define ompi_rte_barrier(a) orte_grpcomm.barrier(a)

 /* Process info struct and values */
@@ -122,6 +122,9 @@ typedef orte_rml_tag_t ompi_rml_tag_t;
 #define OMPI_RML_PERSISTENT ORTE_RML_PERSISTENT
 #define OMPI_RML_NON_PERSISTENT ORTE_RML_NON_PERSISTENT

+/* define a local variable shared between component and module */
+OMPI_MODULE_DECLSPEC extern bool ompi_rte_orte_direct_modex;
+
 END_C_DECLS

 #endif /* MCA_OMPI_RTE_ORTE_H */

@@ -20,6 +20,8 @@
 #include "ompi/mca/rte/rte.h"
 #include "rte_orte.h"

+bool ompi_rte_orte_direct_modex;
+
 /*
  * Public string showing the component version number
  */
@@ -30,6 +32,7 @@ const char *ompi_rte_orte_component_version_string =
  * Local function
  */
 static int rte_orte_open(void);
+static int rte_orte_register(void);

 /*
  * Instantiate the public struct with all of our public information
@@ -52,7 +55,9 @@ const ompi_rte_component_t mca_rte_orte_component = {

         /* Component open and close functions */
         rte_orte_open,
-        NULL
+        NULL,
+        NULL,
+        rte_orte_register
     },
     {
         /* The component is checkpoint ready */
@@ -66,3 +71,14 @@ static int rte_orte_open(void)
     return OPAL_SUCCESS;
 }
+
+static int rte_orte_register(void)
+{
+    ompi_rte_orte_direct_modex = false;
+    (void) mca_base_component_var_register (&mca_rte_orte_component.base_version,
+                                            "direct_modex", "Enable direct modex (default: false)",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY, &ompi_rte_orte_direct_modex);
+    return OMPI_SUCCESS;
+}

@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2013      Intel, Inc. All rights reserved
+ * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
  */
 #include "ompi_config.h"
 #include "ompi/constants.h"
@@ -17,7 +17,7 @@
 #include "orte/mca/errmgr/errmgr.h"
 #include "orte/mca/ess/ess.h"
-#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/mca/grpcomm/base/base.h"
 #include "orte/mca/odls/odls.h"
 #include "orte/mca/plm/plm.h"
 #include "orte/mca/rml/rml.h"
@@ -37,6 +37,7 @@
 #include "ompi/mca/rte/rte.h"
 #include "ompi/debuggers/debuggers.h"
 #include "ompi/proc/proc.h"
+#include "ompi/runtime/params.h"

 void ompi_rte_abort(int error_code, char *fmt, ...)
 {
@@ -127,15 +128,47 @@ void ompi_rte_wait_for_debugger(void)
         /* VPID 0 waits for a message from the HNP */
         OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
+        xfer.active = true;
         orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
                                 ORTE_RML_TAG_DEBUGGER_RELEASE,
                                 ORTE_RML_NON_PERSISTENT,
                                 orte_rml_recv_callback, &xfer);
-        xfer.active = true;
-        ORTE_WAIT_FOR_COMPLETION(xfer.active);
+        /* let the MPI progress engine run while we wait */
+        OMPI_WAIT_FOR_COMPLETION(xfer.active);
     }
 }

+int ompi_rte_modex(ompi_rte_collective_t *coll)
+{
+    if ((orte_process_info.num_procs < ompi_hostname_cutoff) ||
+        !ompi_rte_orte_direct_modex ||
+        orte_standalone_operation) {
+        /* if we are direct launched and/or below a user-specified
+         * cutoff value, then we just fall thru to the ORTE modex
+         */
+        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
+                             "%s running modex",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+        return orte_grpcomm.modex(coll);
+    }
+
+    /* if the user defined a cutoff value that we are larger
+     * than, and if we were not direct launched, then skip
+     * the modex operation. We already have all the RTE-level
+     * info we need, and we will retrieve the MPI-level info
+     * only as requested. This will provide a faster startup
+     * time since we won't do a massive allgather operation,
+     * but will make first-message connections slower. However,
+     * we still have to do a barrier op here to ensure that all
+     * procs have had time to store their modex info prior to
+     * receiving a request to provide it!
+     */
+    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
+                         "%s using direct modex - executing barrier",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+    return orte_grpcomm.barrier(coll);
+}
+
 int ompi_rte_db_store(const orte_process_name_t *nm, const char* key,
                       const void *data, opal_data_type_t type)
 {
@@ -143,6 +176,47 @@ int ompi_rte_db_store(const orte_process_name_t *nm, const char* key,
     return opal_db.store((opal_identifier_t*)nm, OPAL_SCOPE_GLOBAL, key, data, type);
 }

+static int direct_modex(orte_process_name_t *peer, opal_scope_t scope)
+{
+    int rc;
+    orte_rml_recv_cb_t xfer;
+    opal_buffer_t *buf;
+
+    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
+                         "%s requesting direct modex from %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(peer)));
+
+    buf = OBJ_NEW(opal_buffer_t);
+    /* pack the scope of the request */
+    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &scope, 1, OPAL_DATA_SCOPE_T))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(buf);
+        return rc;
+    }
+    if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(peer, buf,
+                                                      ORTE_RML_TAG_DIRECT_MODEX,
+                                                      orte_rml_send_callback, NULL))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(buf);
+        return rc;
+    }
+    OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
+    xfer.active = true;
+    orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
+                            ORTE_RML_TAG_DIRECT_MODEX_RESP,
+                            ORTE_RML_NON_PERSISTENT,
+                            orte_rml_recv_callback, &xfer);
+    OMPI_WAIT_FOR_COMPLETION(xfer.active);
+    /* got it - this is a std modex package, so unpack it with the
+     * grpcomm function and cache it locally so we can quickly get
+     * more pieces if necessary
+     */
+    orte_grpcomm_base_store_modex(&xfer.data, NULL);
+    return ORTE_SUCCESS;
+}
+
 int ompi_rte_db_fetch(const struct ompi_proc_t *proc,
                       const char *key,
                       void **data, opal_data_type_t type)
@@ -150,7 +224,18 @@ int ompi_rte_db_fetch(const struct ompi_proc_t *proc,
     int rc;

     if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)(&proc->proc_name), key, data, type))) {
-        return rc;
+        /* if we couldn't fetch the data via the db, then we will attempt
+         * to retrieve it from the target proc
+         */
+        if (ORTE_SUCCESS != (rc = direct_modex((orte_process_name_t*)&proc->proc_name, OPAL_SCOPE_PEER))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        /* now retrieve the requested piece */
+        if (OPAL_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)(&proc->proc_name), key, data, type))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
     }
     /* update the hostname upon first call to modex-recv for this proc */
     if (NULL == proc->proc_hostname) {
@@ -166,7 +251,18 @@ int ompi_rte_db_fetch_pointer(const struct ompi_proc_t *proc,
     int rc;

     if (OPAL_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)(&proc->proc_name), key, data, type))) {
-        return rc;
+        /* if we couldn't fetch the data via the db, then we will attempt
+         * to retrieve it from the target proc
+         */
+        if (ORTE_SUCCESS != (rc = direct_modex((orte_process_name_t*)&proc->proc_name, OPAL_SCOPE_PEER))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        /* now retrieve the requested piece */
+        if (OPAL_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)(&proc->proc_name), key, data, type))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
     }
     /* update the hostname upon first call to modex-recv for this proc */
     if (NULL == proc->proc_hostname) {
@@ -184,7 +280,19 @@ int ompi_rte_db_fetch_multiple(const struct ompi_proc_t *proc,
     /* MPI processes are only concerned with shared info */
     if (OPAL_SUCCESS != (rc = opal_db.fetch_multiple((opal_identifier_t*)(&proc->proc_name),
                                                      OPAL_SCOPE_GLOBAL, key, kvs))) {
-        return rc;
+        /* if we couldn't fetch the data via the db, then we will attempt
+         * to retrieve it from the target proc
+         */
+        if (ORTE_SUCCESS != (rc = direct_modex((orte_process_name_t*)&proc->proc_name, OPAL_SCOPE_GLOBAL))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        /* now retrieve the requested pieces */
+        if (OPAL_SUCCESS != (rc = opal_db.fetch_multiple((opal_identifier_t*)(&proc->proc_name),
+                                                         OPAL_SCOPE_GLOBAL, key, kvs))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
     }
     /* update the hostname upon first call to modex-recv for this proc */
     if (NULL == proc->proc_hostname) {

@@ -339,7 +339,7 @@ int ompi_mpi_register_params(void)
     (void) mca_base_var_register ("ompi", "ompi", NULL, "hostname_cutoff",
                                   "If the number of processes in the application exceeds the provided value,"
                                   "hostnames for remote processes will not be retrieved by applications [default: UINT32_MAX]",
-                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                  MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
                                   OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
                                   &ompi_hostname_cutoff);

@@ -247,6 +247,10 @@ void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata)
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
+        OPAL_OUTPUT_VERBOSE((10, orte_grpcomm_base_framework.framework_output,
+                             "%s STORING MODEX DATA FROM %s FOR %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_NAME_PRINT(&pname), kv->key));
         /* if this is me, dump the data - we already have it in the db */
         if (ORTE_PROC_MY_NAME->jobid == pname.jobid &&
             ORTE_PROC_MY_NAME->vpid == pname.vpid) {
@@ -318,6 +322,9 @@ int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf, opal_scope_t scope)
     /* if there are entries, store them */
     while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&data))) {
+        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
+                             "%s grpcomm:base:pack_modex: packing entry for %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), kv->key));
         if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kv, 1, OPAL_VALUE))) {
             ORTE_ERROR_LOG(rc);
             break;

@@ -54,6 +54,9 @@ static void daemon_coll_recv(int status, orte_process_name_t* sender,
 static void app_recv(int status, orte_process_name_t* sender,
                      opal_buffer_t* buffer, orte_rml_tag_t tag,
                      void* cbdata);
+static void direct_modex(int status, orte_process_name_t* sender,
+                         opal_buffer_t* buffer, orte_rml_tag_t tag,
+                         void* cbdata);
 static void coll_id_req(int status, orte_process_name_t* sender,
                         opal_buffer_t* buffer, orte_rml_tag_t tag,
                         void* cbdata);
@@ -96,6 +99,10 @@ int orte_grpcomm_base_comm_start(void)
                                 ORTE_RML_TAG_COLLECTIVE,
                                 ORTE_RML_PERSISTENT,
                                 app_recv, NULL);
+        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                ORTE_RML_TAG_DIRECT_MODEX,
+                                ORTE_RML_PERSISTENT,
+                                direct_modex, NULL);
         recv_issued = true;
     }
 }
@@ -117,6 +124,7 @@
     }
     if (ORTE_PROC_IS_HNP) {
         orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID_REQ);
+        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX);
     }
     recv_issued = false;
 }
@@ -307,6 +315,54 @@ static void app_recv(int status, orte_process_name_t* sender,
     }
 }

+static void direct_modex(int status, orte_process_name_t* sender,
+                         opal_buffer_t* buffer, orte_rml_tag_t tag,
+                         void* cbdata)
+{
+    opal_buffer_t *buf;
+    int rc, cnt;
+    opal_scope_t scope;
+
+    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
+                         "%s providing direct modex for %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(sender)));
+
+    /* we always must send a response, even if nothing could be
+     * returned, to prevent the remote proc from hanging
+     */
+    buf = OBJ_NEW(opal_buffer_t);
+
+    /* get the desired scope */
+    cnt = 1;
+    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &scope, &cnt, OPAL_DATA_SCOPE_T))) {
+        ORTE_ERROR_LOG(rc);
+        goto respond;
+    }
+
+    /* pack our process name so the remote end can use the std
+     * unpacking routine
+     */
+    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
+        ORTE_ERROR_LOG(rc);
+        goto respond;
+    }
+
+    /* collect the desired data */
+    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(buf, scope))) {
+        ORTE_ERROR_LOG(rc);
+    }
+
+ respond:
+    if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(sender, buf,
+                                                      ORTE_RML_TAG_DIRECT_MODEX_RESP,
+                                                      orte_rml_send_callback, NULL))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(buf);
+    }
+}
+
 /**** DAEMON COLLECTIVE SUPPORT ****/
 /* recv for collective messages sent from a daemon's local procs */
 static void daemon_local_recv(int status, orte_process_name_t* sender,

@@ -142,6 +142,10 @@ BEGIN_C_DECLS
 /* sensor data */
 #define ORTE_RML_TAG_SENSOR_DATA 47

+/* direct modex support */
+#define ORTE_RML_TAG_DIRECT_MODEX 48
+#define ORTE_RML_TAG_DIRECT_MODEX_RESP 49
+
 #define ORTE_RML_TAG_MAX 100