ompi/proc: add function to get all allocated procs
This commit adds two new functions: - ompi_proc_get_allocated - Returns all procs in the current job that have already been allocated. This is used in init/finalize to determine which procs to pass to add_procs/del_procs. - ompi_proc_world_size - Returns the number of processes in MPI_COMM_WORLD. This may be removed in favor of callers just looking at ompi_process_info. The behavior of ompi_proc_world has been restored to return ompi_proc_t's for all processes in the current job. The use of this function is discouraged. Code that was using ompi_proc_world() has been updated to make use of the new functions to avoid the memory overhead of ompi_proc_world (). Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
db74fa9d0f
Коммит
2c89c7f47d
@ -304,27 +304,30 @@ int ompi_mtl_mxm_module_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (NULL == (procs = ompi_proc_world(&totps))) {
|
||||
MXM_ERROR("Unable to obtain process list");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
totps = ompi_proc_world_size ();
|
||||
|
||||
if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
|
||||
MXM_VERBOSE(1, "MXM support will be disabled because of total number "
|
||||
"of processes (%lu) is less than the minimum set by the "
|
||||
"mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np);
|
||||
free(procs);
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
MXM_VERBOSE(1, "MXM support enabled");
|
||||
|
||||
if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
|
||||
MXM_ERROR("Unable to obtain local node rank");
|
||||
free(procs);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
nlps = ompi_process_info.num_local_peers + 1;
|
||||
|
||||
/* local procs are always allocated. if that ever changes this will need to
|
||||
* be modified. */
|
||||
procs = ompi_proc_get_allocated (&totps);
|
||||
if (NULL == procs) {
|
||||
MXM_ERROR("Unable to obtain process list");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
for (proc = 0; proc < totps; proc++) {
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags)) {
|
||||
mxlr = max(mxlr, procs[proc]->super.proc_name.vpid);
|
||||
@ -595,14 +598,8 @@ int ompi_mtl_mxm_del_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
|
||||
size_t i;
|
||||
|
||||
#if MXM_API >= MXM_VERSION(3,1)
|
||||
if (ompi_mtl_mxm.bulk_disconnect) {
|
||||
size_t nprocs_world;
|
||||
ompi_proc_t **procs;
|
||||
procs = ompi_proc_world(&nprocs_world);
|
||||
if (nprocs == nprocs_world) {
|
||||
mxm_ep_powerdown(ompi_mtl_mxm.ep);
|
||||
}
|
||||
free(procs);
|
||||
if (ompi_mtl_mxm.bulk_disconnect && nprocs == ompi_proc_world_size ()) {
|
||||
mxm_ep_powerdown(ompi_mtl_mxm.ep);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -399,7 +399,12 @@ int ompi_proc_finalize (void)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ompi_proc_t** ompi_proc_world(size_t *size)
|
||||
int ompi_proc_world_size (void)
|
||||
{
|
||||
return ompi_process_info.num_procs;
|
||||
}
|
||||
|
||||
ompi_proc_t **ompi_proc_get_allocated (size_t *size)
|
||||
{
|
||||
ompi_proc_t **procs;
|
||||
ompi_proc_t *proc;
|
||||
@ -456,6 +461,55 @@ ompi_proc_t** ompi_proc_world(size_t *size)
|
||||
return procs;
|
||||
}
|
||||
|
||||
ompi_proc_t **ompi_proc_world (size_t *size)
|
||||
{
|
||||
ompi_proc_t **procs;
|
||||
ompi_proc_t *proc;
|
||||
size_t count = 0;
|
||||
ompi_rte_cmp_bitmask_t mask;
|
||||
ompi_process_name_t my_name;
|
||||
|
||||
/* check bozo case */
|
||||
if (NULL == ompi_proc_local_proc) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* First count how many match this jobid (we already know this from our process info) */
|
||||
count = ompi_process_info.num_procs;
|
||||
|
||||
/* allocate an array */
|
||||
procs = (ompi_proc_t **) malloc (count * sizeof(ompi_proc_t*));
|
||||
if (NULL == procs) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* now get/allocate all the procs in this jobid */
|
||||
for (int i = 0 ; i < count ; ++i) {
|
||||
opal_process_name_t name = {.jobid = OMPI_CAST_RTE_NAME(&ompi_proc_local_proc->super.proc_name)->jobid,
|
||||
.vpid = i};
|
||||
|
||||
/* DO NOT RETAIN THIS OBJECT - the reference count on this
|
||||
* object will be adjusted by external callers. The intent
|
||||
* here is to allow the reference count to drop to zero if
|
||||
* the app no longer desires to communicate with this proc.
|
||||
* For example, the proc may call comm_disconnect on all
|
||||
* communicators involving this proc. In such cases, we want
|
||||
* the proc object to be removed from the list. By not incrementing
|
||||
* the reference count here, we allow this to occur.
|
||||
*
|
||||
* We don't implement that yet, but we are still safe for now as
|
||||
* the OBJ_NEW in ompi_proc_init owns the initial reference
|
||||
* count which cannot be released until ompi_proc_finalize is
|
||||
* called.
|
||||
*/
|
||||
procs[i] = ompi_proc_for_name (name);
|
||||
}
|
||||
|
||||
*size = count;
|
||||
|
||||
return procs;
|
||||
}
|
||||
|
||||
|
||||
ompi_proc_t** ompi_proc_all(size_t* size)
|
||||
{
|
||||
|
@ -138,7 +138,10 @@ OMPI_DECLSPEC int ompi_proc_finalize(void);
|
||||
* Returns the list of proc instances associated with this job. Given
|
||||
* the current association between a job and an MPI_COMM_WORLD, this
|
||||
* function provides the process instances for the current
|
||||
* MPI_COMM_WORLD.
|
||||
* MPI_COMM_WORLD. Use this function only if absolutely needed as it
|
||||
* will cause ompi_proc_t objects to be allocated for every process in
|
||||
* the job. If you only need the allocated ompi_proc_t objects call
|
||||
* ompi_proc_get_allocated() instead.
|
||||
*
|
||||
* @note The reference count of each process in the array is
|
||||
* NOT incremented - the caller is responsible for ensuring the
|
||||
@ -152,6 +155,36 @@ OMPI_DECLSPEC int ompi_proc_finalize(void);
|
||||
*/
|
||||
OMPI_DECLSPEC ompi_proc_t** ompi_proc_world(size_t* size);
|
||||
|
||||
/**
|
||||
* Returns the number of processes associated with this job.
|
||||
*
|
||||
* Returns the number of processes associated with this job. Given
|
||||
* the current association between a job and an MPI_COMM_WORLD, this
|
||||
* function provides the number of processes for the current
|
||||
* MPI_COMM_WORLD.
|
||||
*/
|
||||
|
||||
OMPI_DECLSPEC int ompi_proc_world_size (void);
|
||||
|
||||
/**
|
||||
* Returns the list of proc instances associated with this job.
|
||||
*
|
||||
* Returns the list of proc instances associated with this job that have
|
||||
* already been allocated. Given the current association between a job
|
||||
* and an MPI_COMM_WORLD, this function provides the allocated process
|
||||
* instances for the current MPI_COMM_WORLD.
|
||||
*
|
||||
* @note The reference count of each process in the array is
|
||||
* NOT incremented - the caller is responsible for ensuring the
|
||||
* correctness of the reference count once they are done with
|
||||
* the array.
|
||||
*
|
||||
* @param[out] size Number of processes in the ompi_proc_t array
|
||||
*
|
||||
* @return Array of pointers to allocated proc instances in the current
|
||||
* MPI_COMM_WORLD, or NULL if there is an internal failure.
|
||||
*/
|
||||
OMPI_DECLSPEC ompi_proc_t **ompi_proc_get_allocated (size_t *size);
|
||||
|
||||
/**
|
||||
* Returns the list of all known proc instances.
|
||||
|
@ -285,8 +285,11 @@ int ompi_mpi_finalize(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* call del_procs on all allocated procs even though some may not be known
|
||||
* to the pml layer. the pml layer is expected to be resilient and ignore
|
||||
* any unknown procs. */
|
||||
nprocs = 0;
|
||||
procs = ompi_proc_world(&nprocs);
|
||||
procs = ompi_proc_get_allocated (&nprocs);
|
||||
MCA_PML_CALL(del_procs(procs, nprocs));
|
||||
free(procs);
|
||||
|
||||
|
@ -739,9 +739,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* add all ompi_proc_t's to PML */
|
||||
if (NULL == (procs = ompi_proc_world(&nprocs))) {
|
||||
error = "ompi_proc_world() failed";
|
||||
/* add all allocated ompi_proc_t's to PML (below the add_procs limit this
|
||||
* behaves identically to ompi_proc_world ()) */
|
||||
if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
|
||||
error = "ompi_proc_get_allocated () failed";
|
||||
goto error;
|
||||
}
|
||||
ret = MCA_PML_CALL(add_procs(procs, nprocs));
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user