1
1

ompi/proc: add function to get all allocated procs

This commit adds two new functions:

 - ompi_proc_get_allocated - Returns all procs in the current job that
   have already been allocated. This is used in init/finalize to
   determine which procs to pass to add_procs/del_procs.

 - ompi_proc_world_size - returns the number of processes in
   MPI_COMM_WORLD. This may be removed in favor of callers just
   looking at ompi_process_info.

The behavior of ompi_proc_world has been restored to return
ompi_proc_t's for all processes in the current job. The use of this
function is discouraged.

Code that was using ompi_proc_world() has been updated to make use of
the new functions to avoid the memory overhead of ompi_proc_world ().

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2015-09-23 13:48:37 -06:00
родитель db74fa9d0f
Коммит 2c89c7f47d
5 изменённых файлов: 108 добавлений и 20 удалений

Просмотреть файл

@ -304,27 +304,30 @@ int ompi_mtl_mxm_module_init(void)
}
#endif
if (NULL == (procs = ompi_proc_world(&totps))) {
MXM_ERROR("Unable to obtain process list");
return OMPI_ERROR;
}
totps = ompi_proc_world_size ();
if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
MXM_VERBOSE(1, "MXM support will be disabled because of total number "
"of processes (%lu) is less than the minimum set by the "
"mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np);
free(procs);
return OMPI_ERR_NOT_SUPPORTED;
}
MXM_VERBOSE(1, "MXM support enabled");
if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
MXM_ERROR("Unable to obtain local node rank");
free(procs);
return OMPI_ERROR;
}
nlps = ompi_process_info.num_local_peers + 1;
/* local procs are always allocated. if that ever changes this will need to
* be modified. */
procs = ompi_proc_get_allocated (&totps);
if (NULL == procs) {
MXM_ERROR("Unable to obtain process list");
return OMPI_ERROR;
}
for (proc = 0; proc < totps; proc++) {
if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags)) {
mxlr = max(mxlr, procs[proc]->super.proc_name.vpid);
@ -595,14 +598,8 @@ int ompi_mtl_mxm_del_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
size_t i;
#if MXM_API >= MXM_VERSION(3,1)
if (ompi_mtl_mxm.bulk_disconnect) {
size_t nprocs_world;
ompi_proc_t **procs;
procs = ompi_proc_world(&nprocs_world);
if (nprocs == nprocs_world) {
mxm_ep_powerdown(ompi_mtl_mxm.ep);
}
free(procs);
if (ompi_mtl_mxm.bulk_disconnect && nprocs == ompi_proc_world_size ()) {
mxm_ep_powerdown(ompi_mtl_mxm.ep);
}
#endif

Просмотреть файл

@ -399,7 +399,12 @@ int ompi_proc_finalize (void)
return OMPI_SUCCESS;
}
ompi_proc_t** ompi_proc_world(size_t *size)
/**
 * Report how many processes belong to the current job.
 *
 * The value is read straight from the process info structure.
 *
 * @return ompi_process_info.num_procs, converted to int.
 */
int ompi_proc_world_size (void)
{
    const int world_size = ompi_process_info.num_procs;

    return world_size;
}
ompi_proc_t **ompi_proc_get_allocated (size_t *size)
{
ompi_proc_t **procs;
ompi_proc_t *proc;
@ -456,6 +461,55 @@ ompi_proc_t** ompi_proc_world(size_t *size)
return procs;
}
/**
 * Return proc instances for every process in the current job.
 *
 * Builds a heap-allocated array with one entry per vpid in
 * [0, ompi_process_info.num_procs). Each entry is obtained from
 * ompi_proc_for_name() using the jobid of the local proc.
 *
 * @param[out] size  Set to the number of entries in the returned array.
 *                   Left untouched when NULL is returned.
 *
 * @return malloc()'d array of proc pointers (callers release the array
 *         itself with free()), or NULL if the local proc has not been set
 *         up yet or the array could not be allocated.
 */
ompi_proc_t **ompi_proc_world (size_t *size)
{
    ompi_proc_t **procs;
    size_t count;

    /* check bozo case */
    if (NULL == ompi_proc_local_proc) {
        return NULL;
    }

    /* the number of procs in this jobid is already known from our process info */
    count = ompi_process_info.num_procs;

    /* allocate an array */
    procs = malloc (count * sizeof *procs);
    if (NULL == procs) {
        return NULL;
    }

    /* now get/allocate all the procs in this jobid. the index is a size_t so
     * the comparison against count is not a signed/unsigned mix. */
    for (size_t i = 0 ; i < count ; ++i) {
        opal_process_name_t name = {.jobid = OMPI_CAST_RTE_NAME(&ompi_proc_local_proc->super.proc_name)->jobid,
                                    .vpid = i};

        /* DO NOT RETAIN THIS OBJECT - the reference count on this
         * object will be adjusted by external callers. The intent
         * here is to allow the reference count to drop to zero if
         * the app no longer desires to communicate with this proc.
         * For example, the proc may call comm_disconnect on all
         * communicators involving this proc. In such cases, we want
         * the proc object to be removed from the list. By not incrementing
         * the reference count here, we allow this to occur.
         *
         * We don't implement that yet, but we are still safe for now as
         * the OBJ_NEW in ompi_proc_init owns the initial reference
         * count which cannot be released until ompi_proc_finalize is
         * called.
         */
        procs[i] = ompi_proc_for_name (name);
    }

    *size = count;

    return procs;
}
ompi_proc_t** ompi_proc_all(size_t* size)
{

Просмотреть файл

@ -138,7 +138,10 @@ OMPI_DECLSPEC int ompi_proc_finalize(void);
* Returns the list of proc instances associated with this job. Given
* the current association between a job and an MPI_COMM_WORLD, this
* function provides the process instances for the current
* MPI_COMM_WORLD.
* MPI_COMM_WORLD. Use this function only if absolutely needed as it
* will cause ompi_proc_t objects to be allocated for every process in
* the job. If you only need the allocated ompi_proc_t objects call
* ompi_proc_get_allocated() instead.
*
* @note The reference count of each process in the array is
* NOT incremented - the caller is responsible for ensuring the
@ -152,6 +155,36 @@ OMPI_DECLSPEC int ompi_proc_finalize(void);
*/
OMPI_DECLSPEC ompi_proc_t** ompi_proc_world(size_t* size);
/**
 * Returns the number of processes associated with this job.
 *
 * Given the current association between a job and an MPI_COMM_WORLD,
 * this function provides the number of processes in the current
 * MPI_COMM_WORLD.
 *
 * @return Number of processes in the current job.
 */
OMPI_DECLSPEC int ompi_proc_world_size (void);
/**
 * Returns the list of already-allocated proc instances associated with
 * this job.
 *
 * Returns the list of proc instances associated with this job that have
 * already been allocated. Given the current association between a job
 * and an MPI_COMM_WORLD, this function provides the allocated process
 * instances for the current MPI_COMM_WORLD.
 *
 * @note The reference count of each process in the array is
 * NOT incremented - the caller is responsible for ensuring the
 * correctness of the reference count once they are done with
 * the array.
 *
 * @note Existing callers release the returned array itself with free().
 *
 * @param[out] size Number of processes in the ompi_proc_t array
 *
 * @return Array of pointers to allocated proc instances in the current
 * MPI_COMM_WORLD, or NULL if there is an internal failure.
 */
OMPI_DECLSPEC ompi_proc_t **ompi_proc_get_allocated (size_t *size);
/**
* Returns the list of all known proc instances.

Просмотреть файл

@ -285,8 +285,11 @@ int ompi_mpi_finalize(void)
return ret;
}
/* call del_procs on all allocated procs even though some may not be known
* to the pml layer. the pml layer is expected to be resilient and ignore
* any unknown procs. */
nprocs = 0;
procs = ompi_proc_world(&nprocs);
procs = ompi_proc_get_allocated (&nprocs);
MCA_PML_CALL(del_procs(procs, nprocs));
free(procs);

Просмотреть файл

@ -739,9 +739,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
goto error;
}
/* add all ompi_proc_t's to PML */
if (NULL == (procs = ompi_proc_world(&nprocs))) {
error = "ompi_proc_world() failed";
/* add all allocated ompi_proc_t's to PML (below the add_procs limit this
* behaves identically to ompi_proc_world ()) */
if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
error = "ompi_proc_get_allocated () failed";
goto error;
}
ret = MCA_PML_CALL(add_procs(procs, nprocs));