
This is a first cut at fixing the problem of comm_spawn children being mapped onto the same nodes as their parents. I am not convinced the behavior implemented here is the right long-term one, but hopefully it will help alleviate the situation for now.

In this implementation, we begin mapping on the first node that has at least one slot available, as determined by comparing slots_inuse against the soft limit. If none of the nodes meets that criterion, we just start at the beginning of the node list since we are oversubscribed anyway.
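
In code terms, this amounts to a small scan over the node list before mapping begins. A minimal sketch (it simply restates the loop added to orte_rmaps_rr_map in the second diff below, using the node_slots/node_slots_inuse fields shown there):

    opal_list_item_t *item, *start = NULL;
    orte_ras_node_t *node;

    /* find the first node still below its soft limit */
    for (item = opal_list_get_first(&master_node_list);
         item != opal_list_get_end(&master_node_list);
         item = opal_list_get_next(item)) {
        node = (orte_ras_node_t*)item;
        if (node->node_slots > node->node_slots_inuse) {
            start = item;
            break;
        }
    }
    if (NULL == start) {
        /* every node is at or above its soft limit - we are oversubscribed
         * either way, so just begin at the head of the list */
        start = opal_list_get_first(&master_node_list);
    }
    cur_node_item = start;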

Note that we ignore this logic if the user specifies a mapping - then it's just "user beware".

The real root cause of the problem is that we don't adjust sched_yield as we add processes onto a node. Hence, the node becomes oversubscribed and performance goes into the toilet. What we REALLY need to do to solve the problem is:

(a) modify the PLS components so they reuse the existing daemons, 

(b) create a way to tell a running process to adjust its sched_yield, and

(c) modify the ODLS components to update the sched_yield on a process per the new method

Until we do that, we will continue to have this problem - all this fix (and any subsequent one that focuses solely on the mapper) does is hopefully make it happen less often.
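
To illustrate item (b), one conceivable mechanism (purely hypothetical - none of these names exist in the tree) is a flag that the runtime flips in a running process once its node becomes oversubscribed, so that the process's idle/progress loop starts yielding the processor:

    #include <sched.h>
    #include <stdbool.h>

    /* hypothetical flag the runtime would set, e.g. from an out-of-band
     * message handler, when the node becomes oversubscribed */
    static volatile bool yield_when_idle = false;

    void example_set_yield(bool yield)
    {
        yield_when_idle = yield;
    }

    /* called from the process's idle/progress loop */
    void example_idle_hook(void)
    {
        if (yield_when_idle) {
            sched_yield();   /* give co-located procs a chance to run */
        }
    }

Items (a) and (c) then cover the delivery side of such a scheme: keeping the existing daemons around so someone on the node can send the notification, and having the ODLS apply the updated yield setting to each local process.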

This commit was SVN r12145.
This commit is contained in:
Ralph Castain 2006-10-17 19:35:00 +00:00
parent 16769e64fe
commit 0c0fe022ff
2 changed files with 33 additions and 12 deletions

View file

@@ -677,7 +677,7 @@ int orte_ras_base_node_assign(opal_list_t* nodes, orte_jobid_t jobid)
int rc;
orte_std_cntr_t num_values, i, j;
orte_ras_node_t* node;
char* jobid_str, *key;
char* jobid_str, *key=NULL;
num_values = opal_list_get_size(nodes);
if (0 >= num_values) {
@@ -703,6 +703,14 @@ int orte_ras_base_node_assign(opal_list_t* nodes, orte_jobid_t jobid)
}
}
/* setup the allocation key */
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_str, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
asprintf(&key, "%s-%s", ORTE_NODE_SLOTS_ALLOC_KEY, jobid_str);
free(jobid_str);
for(i=0, item = opal_list_get_first(nodes);
i < num_values && item != opal_list_get_end(nodes);
i++, item = opal_list_get_next(item)) {
@@ -711,11 +719,6 @@ int orte_ras_base_node_assign(opal_list_t* nodes, orte_jobid_t jobid)
if(node->node_slots_alloc == 0)
continue;
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_str, jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup index/keys for this node */
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
if (ORTE_SUCCESS != rc) {
@@ -725,16 +728,13 @@ int orte_ras_base_node_assign(opal_list_t* nodes, orte_jobid_t jobid)
}
/* setup node key/value pairs */
asprintf(&key, "%s-%s", ORTE_NODE_SLOTS_ALLOC_KEY, jobid_str);
free(jobid_str);
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), key, ORTE_STD_CNTR, &(node->node_slots_alloc)))) {
ORTE_ERROR_LOG(rc);
free(key);
goto cleanup;
}
free(key);
}
/* try the insert */
if (ORTE_SUCCESS != (rc = orte_gpr.put(num_values, values))) {
ORTE_ERROR_LOG(rc);
@@ -746,6 +746,8 @@ cleanup:
}
if (NULL != values) free(values);
if (NULL != key) free(key);
return rc;
}

View file

@@ -163,7 +163,7 @@ static int map_app_by_slot(
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop */
if(0 >= opal_list_get_size(nodes) ) {
/* No more nodes to allocate :( */
/* Everything is at max usage! :( */
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
true, app->num_procs, app->app);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@@ -275,9 +275,28 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, char *ignore)
return rc;
}
/** initialize the cur_node_item to point to the first node in the list */
/* initialize the cur_node_item to point to the first node in the list that has
* an available slot. We need to check the slot availability since we may be
* mapping a child job onto the same nodes used by its parent. In that case,
* even though we may have used some slots on a node, the system still considers
* the node available due to oversubscription rules. However, we don't want to
* start at the beginning of the nodelist again as we will be oversubscribing the
* node and causing majorly poor performance
*/
for (cur_node_item = opal_list_get_first(&master_node_list);
cur_node_item != opal_list_get_end(&master_node_list);
cur_node_item = opal_list_get_next(cur_node_item)) {
node = (orte_ras_node_t*)cur_node_item;
if (node->node_slots > node->node_slots_inuse) {
goto MOVEON;
}
}
/* if we got here, then everyone is at or above the soft limit - just
* start with the first node on the list
*/
cur_node_item = opal_list_get_first(&master_node_list);
MOVEON:
/** construct the list to hold any nodes that get fully used during this
* mapping. We need to keep a record of these so we can update their
* information on the registry when we are done, but we want to remove