5f1254d710
Use of the old ompi_free_list_t and ompi_free_list_item_t is deprecated. These classes will be removed in a future commit. This commit updates the entire code base to use opal_free_list_t and opal_free_list_item_t. Notes: OMPI_FREE_LIST_*_MT -> opal_free_list_* (uses opal_using_threads ()) Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
553 строки
21 KiB
C
553 строки
21 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
|
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
|
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2014 Research Organization for Information Science
|
|
* and Technology (RIST). All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef MCA_COLL_ML_COLLS_H
|
|
#define MCA_COLL_ML_COLLS_H
|
|
|
|
#include "ompi_config.h"
|
|
#include "ompi/mca/bcol/bcol.h"
|
|
|
|
#define COLL_ML_FN_NAME_LEN 256
|
|
|
|
|
|
/* utility information used to coordinate activities, such as resource
|
|
* management between different functions in the hierarchy
|
|
*/
|
|
struct mca_coll_ml_utility_data_t {
|
|
|
|
/* RLG - temp fix !!!! - really need to remove this, but right now
|
|
do not want to change the signature of the collective primitives to
|
|
use coll_ml_utility_data_t rather than mca_bcol_base_function_t */
|
|
int dummy;
|
|
|
|
/* module */
|
|
struct mca_bcol_base_module_t *bcol_module;
|
|
|
|
/* */
|
|
int index_in_consecutive_same_bcol_calls;
|
|
|
|
/* number of times functions from this bcol are called in order */
|
|
int n_of_this_type_in_a_row;
|
|
|
|
/* number of times functions from this module are called
|
|
* in the collective operation. */
|
|
int n_of_this_type_in_collective;
|
|
int index_of_this_type_in_collective;
|
|
|
|
};
|
|
typedef struct mca_coll_ml_utility_data_t mca_coll_ml_utility_data_t;
|
|
|
|
|
|
/* forward declaration */
|
|
struct mca_coll_ml_collective_operation_progress_t;
|
|
struct mca_coll_ml_task_status_t;
|
|
|
|
typedef int (* mca_coll_ml_process_op_fn_t)
|
|
(struct mca_coll_ml_collective_operation_progress_t *coll_op);
|
|
|
|
typedef int (* mca_coll_ml_task_comp_fn_t)
|
|
(struct mca_coll_ml_task_status_t *coll_op);
|
|
|
|
typedef int (* mca_coll_ml_fragment_launch_fn_t)
|
|
( struct mca_coll_ml_collective_operation_progress_t *coll_op);
|
|
|
|
typedef int (* mca_coll_ml_sequential_task_setup_fn_t)
|
|
( struct mca_coll_ml_collective_operation_progress_t *coll_op);
|
|
/* This data structure defines the dependencies for a given
|
|
* compound operation. We will use this as a basis for implementing
|
|
* collective operations.
|
|
*/
|
|
struct mca_coll_ml_compound_functions_t {
|
|
/* label */
|
|
char fn_name[COLL_ML_FN_NAME_LEN];
|
|
|
|
/* hierarchy level that is used for this bcol */
|
|
int h_level;
|
|
|
|
/* the list of functions that make up this task */
|
|
/* coll_bcol_collective_description_t *bcol_function; */
|
|
mca_bcol_base_coll_fn_desc_t *bcol_function;
|
|
/* task completion function for this compound function */
|
|
mca_coll_ml_task_comp_fn_t task_comp_fn;
|
|
|
|
/* module specific information that is a constant on a per group
|
|
* basis
|
|
*/
|
|
mca_coll_ml_utility_data_t constant_group_data;
|
|
|
|
/* number of dependencies to be satified before these function can be
|
|
* started */
|
|
int num_dependencies;
|
|
|
|
/*
|
|
* number of notifications to perform on completion. The assumption
|
|
* is that a counter will be incremented.
|
|
*/
|
|
int num_dependent_tasks;
|
|
|
|
/*
|
|
* pointers to counters that need be updated. This assumes
|
|
* an array of tasks is used to describe the ML level
|
|
* collective operation, with these indecies referencing elements
|
|
* in this array.
|
|
*/
|
|
int *dependent_task_indices;
|
|
|
|
};
|
|
|
|
typedef struct mca_coll_ml_compound_functions_t mca_coll_ml_compound_functions_t;
|
|
|
|
/* Forward declaration for operation_description_t */
|
|
struct mca_coll_ml_module_t;
|
|
|
|
enum {
|
|
COLL_ML_GENERAL_TASK_FN,
|
|
COLL_ML_ROOT_TASK_FN,
|
|
COLL_ML_MAX_TASK_FN
|
|
};
|
|
|
|
enum {
|
|
SEQ_TASK_NOT_STARTED,
|
|
SEQ_TASK_PENDING,
|
|
SEQ_TASK_IN_PROG
|
|
};
|
|
|
|
typedef void (*mca_coll_ml_task_setup_fn_t) (struct mca_coll_ml_task_status_t *task_status, int index, struct mca_coll_ml_compound_functions_t *func);
|
|
|
|
/*
|
|
* Collective operation definition
|
|
*/
|
|
struct mca_coll_ml_collective_operation_description_t {
|
|
|
|
/*
|
|
* Type of collective opeartion - there are two types:
|
|
* 1) sequential progress through the collectives is sufficient
|
|
* 2) general treatment, popping tasks onto execution queus is needed.
|
|
*/
|
|
int progress_type;
|
|
|
|
struct mca_coll_ml_topology_t *topo_info;
|
|
|
|
/*
|
|
* number of functions in collective operation
|
|
*/
|
|
int n_fns;
|
|
|
|
/*
|
|
* list of functions
|
|
*/
|
|
mca_coll_ml_compound_functions_t *component_functions;
|
|
|
|
/*
|
|
* array of lists of functions
|
|
*/
|
|
mca_coll_ml_compound_functions_t **comp_fn_arr;
|
|
|
|
/*
|
|
* indices into the list - fixes a sequential schedule
|
|
*/
|
|
int *sch_idx;
|
|
|
|
/*
|
|
* Task setup functions, so far we have only 3 - root and non-root
|
|
*/
|
|
mca_coll_ml_task_setup_fn_t task_setup_fn[COLL_ML_MAX_TASK_FN];
|
|
|
|
/* number of functions are called for bcols need ordering */
|
|
int n_fns_need_ordering;
|
|
};
|
|
typedef struct mca_coll_ml_collective_operation_description_t
|
|
mca_coll_ml_collective_operation_description_t;
|
|
|
|
/* Data structure used to track the state of individual bcol
|
|
* functions. This is used to track dependencies and completion
|
|
* to progress the ML level function correctly.
|
|
*
|
|
* mca_coll_ml_task_status_t will be associated with an
|
|
* mca_coll_ml_collective_operation_progress_t structure for
|
|
* the duration of the lifetime of a communicator.
|
|
* An array of task statuses will be stored with
|
|
* the mca_coll_ml_collective_operation_progress_t data structure, so
|
|
* that the taks status elements do not need to be moved back to
|
|
* a free list before they are re-used. When the ML level function
|
|
* is complete, all mca_coll_ml_task_status_t are available for
|
|
* re-use.
|
|
*/
|
|
struct mca_coll_ml_task_status_t{
|
|
/* need to move this between lists to progress this correctly */
|
|
opal_list_item_t item;
|
|
|
|
/* number of dependencies satisfied */
|
|
int n_dep_satisfied;
|
|
|
|
/* ***************************************************************
|
|
* Pasha:
|
|
* I'm adding to the status: num_dependencies, num_dependent_tasks and
|
|
* dependent_task_indices. The information originally resided on mca_coll_ml_compound_functions_t.
|
|
* For collective operation with static nature it is not problem.
|
|
* But for Bcast operation, where run time parameters, like root, actually
|
|
* define the dependency. rt prefix mean run-time.
|
|
*/
|
|
|
|
/* number of dependencies to be satisfied before these function can be
|
|
* started */
|
|
int rt_num_dependencies;
|
|
|
|
/*
|
|
* number of notifications to perform on completion. The assumption
|
|
* is that a counter will be incremented.
|
|
*/
|
|
int rt_num_dependent_tasks;
|
|
|
|
/*
|
|
* pointers to counters that need be updated. This assumes
|
|
* an array of tasks is used to describe the ML level
|
|
* collective operation, with these indecies referencing elements
|
|
* in this array.
|
|
*/
|
|
int *rt_dependent_task_indices;
|
|
/*
|
|
*
|
|
* ***************************************************************/
|
|
|
|
/* index in collective schedule */
|
|
int my_index_in_coll_schedule;
|
|
|
|
/* function pointers */
|
|
mca_bcol_base_coll_fn_desc_t *bcol_fn;
|
|
|
|
/* association with a specific collective task - the ML
|
|
* mca_coll_ml_collective_operation_progress_t stores the
|
|
* specific function parameters */
|
|
struct mca_coll_ml_collective_operation_progress_t *ml_coll_operation;
|
|
|
|
mca_coll_ml_task_comp_fn_t task_comp_fn;
|
|
};
|
|
typedef struct mca_coll_ml_task_status_t mca_coll_ml_task_status_t;
|
|
|
|
typedef enum mca_coll_ml_pending_type_t {
|
|
REQ_OUT_OF_ORDER = 1,
|
|
REQ_OUT_OF_MEMORY = 1 << 1
|
|
} mca_coll_ml_pending_type_t;
|
|
|
|
/* Forward declaration */
|
|
struct mca_bcol_base_payload_buffer_desc_t;
|
|
/* Data structure used to track ML level collective operation
|
|
* progress.
|
|
*/
|
|
struct mca_coll_ml_collective_operation_progress_t {
|
|
/* need this to put on a list properly */
|
|
/* Full message information */
|
|
struct full_message_t {
|
|
/* make this a list item */
|
|
ompi_request_t super;
|
|
/* Next expected fragment.
|
|
* It used for controling order of converter unpack operation */
|
|
size_t next_expected_index;
|
|
/* Pointer to last intilized fragment.
|
|
* It used for controling order of converter unpack operation */
|
|
struct mca_coll_ml_collective_operation_progress_t *last_started_frag;
|
|
/* destination data address in user memory */
|
|
void *dest_user_addr;
|
|
/* source data address in user memory */
|
|
void *src_user_addr;
|
|
/* total message size */
|
|
size_t n_bytes_total;
|
|
/* per-process total message size - relevant for operations
|
|
* such as gather and scatter, where each rank has it's
|
|
* own unique data
|
|
*/
|
|
size_t n_bytes_per_proc_total;
|
|
size_t max_n_bytes_per_proc_total;
|
|
/* data processes - from a local perspective */
|
|
size_t n_bytes_delivered;
|
|
/* current offset - where to continue with next fragment */
|
|
size_t n_bytes_scheduled;
|
|
/* number of fragments needed to process this message */
|
|
size_t n_fragments;
|
|
/* number of active frags */
|
|
int n_active;
|
|
/* actual pipeline depth */
|
|
int pipeline_depth;
|
|
/* am I the real root of the collective ? */
|
|
bool root;
|
|
/* collective fragment launcher */
|
|
mca_coll_ml_fragment_launch_fn_t fragment_launcher;
|
|
/* is data contingous */
|
|
bool send_data_continguous;
|
|
bool recv_data_continguous;
|
|
/* data type count */
|
|
int64_t send_count;
|
|
int64_t recv_count;
|
|
/* extent of the data types */
|
|
size_t send_extent;
|
|
size_t recv_extent;
|
|
/* send data type */
|
|
struct ompi_datatype_t * send_data_type;
|
|
/* needed for non-contigous buffers */
|
|
size_t offset_into_send_buffer;
|
|
/* receive data type */
|
|
struct ompi_datatype_t * recv_data_type;
|
|
/* needed for non-contigous buffers */
|
|
size_t offset_into_recv_buffer;
|
|
/* Convertors for non contigous data */
|
|
opal_convertor_t send_convertor;
|
|
opal_convertor_t recv_convertor;
|
|
/* Will be used by receiver for #bytes calc in the next frag */
|
|
opal_convertor_t dummy_convertor;
|
|
size_t dummy_conv_position;
|
|
/* Size of packed data */
|
|
size_t send_converter_bytes_packed;
|
|
size_t recv_converter_bytes_packed;
|
|
/* In case if ordering is needed: order num for next frag */
|
|
int next_frag_num;
|
|
/* The variable is used by non-blocking memory synchronization code
|
|
* for caching bank index */
|
|
int bank_index_to_recycle;
|
|
/* need a handle for collective progress e.g. alltoall*/
|
|
bcol_fragment_descriptor_t frag_info;
|
|
} full_message;
|
|
|
|
/* collective operation being progressed */
|
|
mca_coll_ml_collective_operation_description_t *coll_schedule;
|
|
/* */
|
|
mca_coll_ml_process_op_fn_t process_fn;
|
|
|
|
mca_coll_base_module_t *coll_module;
|
|
|
|
/* If not null , we have to release next fragment */
|
|
struct mca_coll_ml_collective_operation_progress_t *next_to_process_frag;
|
|
/* pointer to previous fragment */
|
|
struct mca_coll_ml_collective_operation_progress_t *prev_frag;
|
|
/* This flag marks that the fragment is pending on the waiting
|
|
* to be processed prior to recycling
|
|
*/
|
|
enum mca_coll_ml_pending_type_t pending;
|
|
|
|
/* Fragment data */
|
|
struct fragment_data_t {
|
|
/* current buffer pointer - offset (in bytes) into the user data */
|
|
size_t offset_into_user_buffer;
|
|
size_t offset_into_user_buffer_per_proc;
|
|
|
|
/* amount of data (in bytes) in this fragment - amount of data
|
|
* actually processed */
|
|
size_t fragment_size;
|
|
size_t per_rank_fragment_size;
|
|
size_t data_type_count_per_frag;
|
|
|
|
/* pointer to full message progress data */
|
|
struct full_message_t *message_descriptor;
|
|
|
|
/* ML buffer descriptor attached to this buffer */
|
|
struct mca_bcol_base_payload_buffer_desc_t *buffer_desc;
|
|
/* handle for collective progress, e.g. alltoall */
|
|
bcol_fragment_descriptor_t bcol_fragment_desc;
|
|
|
|
/* Which collective algorithm */
|
|
int current_coll_op;
|
|
} fragment_data;
|
|
|
|
/* specific function parameters */
|
|
/* the assumption is that the variable parameters passed into
|
|
* the ML level function will persist until the collective operation
|
|
* is complete. For a blocking function this is until the collective
|
|
* function is exited, and for nonblocking collective functions this
|
|
* is until test or wait completes the collective.
|
|
*/
|
|
int global_root;
|
|
bcol_function_args_t variable_fn_params;
|
|
|
|
struct{
|
|
/* current active function - for sequential algorithms */
|
|
int current_active_bcol_fn;
|
|
|
|
/* current function status - not started, or in progress.
|
|
* When the routine has completed, the active bcol index is
|
|
* incremented, so no need to keep track of a completed
|
|
* status.
|
|
*/
|
|
int current_bcol_status;
|
|
|
|
/* use this call back to setup algorithm specific info
|
|
after each level necessary
|
|
*/
|
|
mca_coll_ml_sequential_task_setup_fn_t seq_task_setup;
|
|
|
|
} sequential_routine;
|
|
|
|
struct{
|
|
/*
|
|
* BCOL function status - individual elements will be posted to
|
|
* ml level component queues, as appropriate.
|
|
*/
|
|
mca_coll_ml_task_status_t *status_array;
|
|
|
|
/* number of completed tasks - need this for collective completion.
|
|
* Resource completion is tracked by each BCOL module .
|
|
*/
|
|
int num_tasks_completed;
|
|
} dag_description;
|
|
};
|
|
typedef struct mca_coll_ml_collective_operation_progress_t
|
|
mca_coll_ml_collective_operation_progress_t;
|
|
OBJ_CLASS_DECLARATION(mca_coll_ml_collective_operation_progress_t);
|
|
|
|
#define OP_ML_MODULE(op) ((mca_coll_ml_module_t *)((op)->coll_module))
|
|
#define GET_COMM(op) ((OP_ML_MODULE(op))->comm)
|
|
#define IS_COLL_SYNCMEM(op) (ML_MEMSYNC == op->fragment_data.current_coll_op)
|
|
|
|
#define CHECK_AND_RECYCLE(op) \
|
|
do { \
|
|
if (0 == (op)->pending) { \
|
|
/* Caching 2 values that we can't to touch on op after returing it */ \
|
|
/* back to the free list (free list may release memory on distruct )*/ \
|
|
struct ompi_communicator_t *comm = GET_COMM(op); \
|
|
bool is_coll_sync = IS_COLL_SYNCMEM(op); \
|
|
ML_VERBOSE(10, ("Releasing %p", op)); \
|
|
OMPI_REQUEST_FINI(&(op)->full_message.super); \
|
|
opal_free_list_return (&(((mca_coll_ml_module_t *)(op)->coll_module)-> \
|
|
coll_ml_collective_descriptors), \
|
|
(opal_free_list_item_t *)op); \
|
|
/* Special check for memory synchronization completion */ \
|
|
/* We have to return it first to free list, since the communicator */ \
|
|
/* release potentially may trigger ML module distraction and having */ \
|
|
/* the element not on the list may cause memory leak. */ \
|
|
if (OPAL_UNLIKELY(is_coll_sync)) { \
|
|
if (OMPI_COMM_IS_INTRINSIC(comm)) { \
|
|
opal_show_help("help-mpi-coll-ml.txt", \
|
|
"coll-ml-check-fatal-error", true, \
|
|
comm->c_name); \
|
|
ompi_mpi_abort(comm, 6); \
|
|
} else { \
|
|
opal_show_help("help-mpi-coll-ml.txt", \
|
|
"coll-ml-check-error", true, \
|
|
comm->c_name); \
|
|
/* After this point it is UNSAFE to touch ml module */ \
|
|
/* or communicator */ \
|
|
OBJ_RELEASE(comm); \
|
|
} \
|
|
} \
|
|
} \
|
|
} while (0)
|
|
|
|
#define MCA_COLL_ML_SET_ORDER_INFO(coll_progress, num_frags) \
|
|
do { \
|
|
mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info; \
|
|
bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \
|
|
if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { \
|
|
variable_params->order_info.bcols_started = 0; \
|
|
variable_params->order_info.order_num = \
|
|
topo->topo_ordering_info.next_order_num; \
|
|
variable_params->order_info.n_fns_need_ordering = \
|
|
(coll_progress)->coll_schedule->n_fns_need_ordering; \
|
|
topo->topo_ordering_info.next_order_num += num_frags; \
|
|
(coll_progress)->fragment_data.message_descriptor->next_frag_num = \
|
|
variable_params->order_info.order_num + 1; \
|
|
} \
|
|
} while (0)
|
|
|
|
#define MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(coll_progress) \
|
|
do { \
|
|
mca_coll_ml_topology_t *topo = (coll_progress)->coll_schedule->topo_info; \
|
|
if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { \
|
|
bcol_function_args_t *variable_params = &(coll_progress)->variable_fn_params; \
|
|
struct fragment_data_t *frag_data = &(coll_progress)->fragment_data; \
|
|
variable_params->order_info.bcols_started = 0; \
|
|
variable_params->order_info.order_num = frag_data->message_descriptor->next_frag_num; \
|
|
variable_params->order_info.n_fns_need_ordering = \
|
|
(coll_progress)->coll_schedule->n_fns_need_ordering; \
|
|
frag_data->message_descriptor->next_frag_num++; \
|
|
} \
|
|
} while (0)
|
|
|
|
#define MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule) \
|
|
do { \
|
|
int i; \
|
|
(schedule)->n_fns_need_ordering = 0; \
|
|
for (i = 0; i < (schedule)->n_fns; ++i) { \
|
|
mca_bcol_base_module_t *current_bcol = \
|
|
(schedule)->component_functions[i].constant_group_data.bcol_module; \
|
|
assert (NULL != current_bcol); \
|
|
if (current_bcol->bcol_component->need_ordering) { \
|
|
(schedule)->n_fns_need_ordering++; \
|
|
} \
|
|
} \
|
|
} while (0)
|
|
|
|
enum {
|
|
MCA_COLL_ML_NET_STREAM_SEND,
|
|
MCA_COLL_ML_NET_STREAM_RECV
|
|
};
|
|
|
|
static inline __opal_attribute_always_inline__
|
|
int mca_coll_ml_convertor_prepare(ompi_datatype_t *dtype, int count, void *buff,
|
|
opal_convertor_t *convertor, int stream)
|
|
{
|
|
size_t bytes_packed;
|
|
|
|
if (MCA_COLL_ML_NET_STREAM_SEND == stream) {
|
|
opal_convertor_copy_and_prepare_for_send(
|
|
ompi_mpi_local_convertor,
|
|
&dtype->super, count, buff, 0,
|
|
convertor);
|
|
} else {
|
|
opal_convertor_copy_and_prepare_for_recv(
|
|
ompi_mpi_local_convertor,
|
|
&dtype->super, count, buff, 0,
|
|
convertor);
|
|
}
|
|
|
|
opal_convertor_get_packed_size(convertor, &bytes_packed);
|
|
|
|
return bytes_packed;
|
|
}
|
|
|
|
static inline __opal_attribute_always_inline__
|
|
int mca_coll_ml_convertor_pack(void *data_addr, size_t buff_size,
|
|
opal_convertor_t *convertor)
|
|
{
|
|
struct iovec iov;
|
|
|
|
size_t max_data = 0;
|
|
uint32_t iov_count = 1;
|
|
|
|
iov.iov_base = (IOVBASE_TYPE*) data_addr;
|
|
iov.iov_len = buff_size;
|
|
|
|
opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
|
|
|
return max_data;
|
|
}
|
|
|
|
static inline __opal_attribute_always_inline__
|
|
int mca_coll_ml_convertor_unpack(void *data_addr, size_t buff_size,
|
|
opal_convertor_t *convertor)
|
|
{
|
|
struct iovec iov;
|
|
|
|
size_t max_data = 0;
|
|
uint32_t iov_count = 1;
|
|
|
|
iov.iov_base = (void *) (uintptr_t) data_addr;
|
|
iov.iov_len = buff_size;
|
|
|
|
opal_convertor_unpack(convertor, &iov, &iov_count, &max_data);
|
|
|
|
return max_data;
|
|
}
|
|
#endif /* MCA_COLL_ML_COLLS_H */
|
|
|