Allow partial use of an allocation by specifying the maximum number of daemons (i.e., the maximum VM size) for the job.
This commit was SVN r26499.
Этот коммит содержится в:
родитель
c69a04e16b
Коммит
be6ed9c2df
@ -1006,6 +1006,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
|||||||
opal_list_item_t *item, *next;
|
opal_list_item_t *item, *next;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
bool one_filter = false;
|
bool one_filter = false;
|
||||||
|
int num_nodes;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:setup_vm",
|
"%s plm:base:setup_vm",
|
||||||
@ -1146,17 +1147,34 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
|||||||
|
|
||||||
/* cycle thru all available nodes and find those that do not already
|
/* cycle thru all available nodes and find those that do not already
|
||||||
* have a daemon on them - no need to include our own as we are
|
* have a daemon on them - no need to include our own as we are
|
||||||
* obviously already here!
|
* obviously already here! If a max vm size was given, then limit
|
||||||
|
* the overall number of active nodes to the given number. Only
|
||||||
|
* count the HNP's node if it was included in the allocation
|
||||||
*/
|
*/
|
||||||
|
if (orte_hnp_is_allocated) {
|
||||||
|
num_nodes = 1;
|
||||||
|
} else {
|
||||||
|
num_nodes = 0;
|
||||||
|
}
|
||||||
while (NULL != (item = opal_list_remove_first(&nodes))) {
|
while (NULL != (item = opal_list_remove_first(&nodes))) {
|
||||||
|
/* if a max size was given and we are there, then exit the loop */
|
||||||
|
if (0 < orte_max_vm_size && num_nodes == orte_max_vm_size) {
|
||||||
|
/* maintain accounting */
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
break;
|
||||||
|
}
|
||||||
node = (orte_node_t*)item;
|
node = (orte_node_t*)item;
|
||||||
/* if this node is already in the map, skip it */
|
/* if this node is already in the map, skip it */
|
||||||
if (NULL != node->daemon) {
|
if (NULL != node->daemon) {
|
||||||
|
num_nodes++;
|
||||||
|
/* maintain accounting */
|
||||||
|
OBJ_RELEASE(item);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* add the node to the map */
|
/* add the node to the map */
|
||||||
opal_pointer_array_add(map->nodes, (void*)node);
|
opal_pointer_array_add(map->nodes, (void*)node);
|
||||||
++(map->num_nodes);
|
++(map->num_nodes);
|
||||||
|
num_nodes++;
|
||||||
/* create a new daemon object for this node */
|
/* create a new daemon object for this node */
|
||||||
proc = OBJ_NEW(orte_proc_t);
|
proc = OBJ_NEW(orte_proc_t);
|
||||||
if (NULL == proc) {
|
if (NULL == proc) {
|
||||||
|
@ -187,6 +187,9 @@ bool orte_map_reduce = false;
|
|||||||
/* map stddiag output to stderr so it isn't forwarded to mpirun */
|
/* map stddiag output to stderr so it isn't forwarded to mpirun */
|
||||||
bool orte_map_stddiag_to_stderr = false;
|
bool orte_map_stddiag_to_stderr = false;
|
||||||
|
|
||||||
|
/* maximum size of virtual machine - used to subdivide allocation */
|
||||||
|
int orte_max_vm_size = -1;
|
||||||
|
|
||||||
/* progress thread */
|
/* progress thread */
|
||||||
#if ORTE_ENABLE_PROGRESS_THREADS
|
#if ORTE_ENABLE_PROGRESS_THREADS
|
||||||
opal_thread_t orte_progress_thread;
|
opal_thread_t orte_progress_thread;
|
||||||
|
@ -714,6 +714,9 @@ ORTE_DECLSPEC extern bool orte_map_reduce;
|
|||||||
/* map stddiag output to stderr so it isn't forwarded to mpirun */
|
/* map stddiag output to stderr so it isn't forwarded to mpirun */
|
||||||
ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr;
|
ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr;
|
||||||
|
|
||||||
|
/* maximum size of virtual machine - used to subdivide allocation */
|
||||||
|
ORTE_DECLSPEC extern int orte_max_vm_size;
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
@ -543,6 +543,10 @@ int orte_register_params(void)
|
|||||||
false, false, (int)false, &value);
|
false, false, (int)false, &value);
|
||||||
orte_preload_binaries = OPAL_INT_TO_BOOL(value);
|
orte_preload_binaries = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "max_vm_size",
|
||||||
|
"Maximum size of virtual machine - used to subdivide allocation",
|
||||||
|
false, false, -1, &orte_max_vm_size);
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -256,6 +256,11 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
&orterun_globals.num_procs, OPAL_CMD_LINE_TYPE_INT,
|
||||||
"Number of processes to run" },
|
"Number of processes to run" },
|
||||||
|
|
||||||
|
/* maximum size of VM - typically used to subdivide an allocation */
|
||||||
|
{ "orte", "max", "vm_size", '\0', "max-vm-size", "max-vm-size", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of processes to run" },
|
||||||
|
|
||||||
/* Set a hostfile */
|
/* Set a hostfile */
|
||||||
{ NULL, NULL, NULL, '\0', "hostfile", "hostfile", 1,
|
{ NULL, NULL, NULL, '\0', "hostfile", "hostfile", 1,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
@ -529,7 +529,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
|
|||||||
int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
||||||
{
|
{
|
||||||
int n;
|
int n;
|
||||||
int32_t num_nodes, i, num_daemons;
|
int32_t num_nodes, i;
|
||||||
orte_vpid_t *vpids;
|
orte_vpid_t *vpids;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
opal_buffer_t buf;
|
opal_buffer_t buf;
|
||||||
@ -604,9 +604,12 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
|||||||
/* transfer the data to the nodes, counting the number of
|
/* transfer the data to the nodes, counting the number of
|
||||||
* daemons in the system
|
* daemons in the system
|
||||||
*/
|
*/
|
||||||
num_daemons = 0;
|
|
||||||
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||||
for (i=0; i < num_nodes; i++) {
|
for (i=0; i < num_nodes; i++) {
|
||||||
|
if (ORTE_VPID_INVALID == vpids[i]) {
|
||||||
|
/* no daemon on this node */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||||
/* this is an error */
|
/* this is an error */
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
@ -618,6 +621,7 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
|||||||
dptr->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
dptr->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
dptr->name.vpid = vpids[i];
|
dptr->name.vpid = vpids[i];
|
||||||
opal_pointer_array_set_item(daemons->procs, vpids[i], dptr);
|
opal_pointer_array_set_item(daemons->procs, vpids[i], dptr);
|
||||||
|
daemons->num_procs++;
|
||||||
}
|
}
|
||||||
if (NULL != node->daemon) {
|
if (NULL != node->daemon) {
|
||||||
OBJ_RELEASE(node->daemon);
|
OBJ_RELEASE(node->daemon);
|
||||||
@ -634,21 +638,18 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
|||||||
} else {
|
} else {
|
||||||
node->oversubscribed = true;
|
node->oversubscribed = true;
|
||||||
}
|
}
|
||||||
if (ORTE_VPID_INVALID != vpids[i]) {
|
|
||||||
++num_daemons;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
free(vpids);
|
free(vpids);
|
||||||
free(oversub);
|
free(oversub);
|
||||||
|
|
||||||
orte_process_info.num_procs = num_daemons;
|
orte_process_info.num_procs = daemons->num_procs;
|
||||||
|
|
||||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update num_daemons */
|
/* update num_daemons */
|
||||||
orte_process_info.num_daemons = num_daemons;
|
orte_process_info.num_daemons = daemons->num_procs;
|
||||||
|
|
||||||
if (0 < opal_output_get_verbosity(orte_debug_output)) {
|
if (0 < opal_output_get_verbosity(orte_debug_output)) {
|
||||||
for (i=0; i < num_nodes; i++) {
|
for (i=0; i < num_nodes; i++) {
|
||||||
@ -658,7 +659,7 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo)
|
|||||||
opal_output(5, "%s node[%d].name %s daemon %s",
|
opal_output(5, "%s node[%d].name %s daemon %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i,
|
||||||
(NULL == node->name) ? "NULL" : node->name,
|
(NULL == node->name) ? "NULL" : node->name,
|
||||||
ORTE_VPID_PRINT(node->daemon->name.vpid));
|
(NULL == node->daemon) ? "NONE" : ORTE_VPID_PRINT(node->daemon->name.vpid));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user