
Fix the sched_yield problem for generic environments. We now determine and set sched_yield during mpi_init based on the following logical sequence:

1. If the user has specified sched_yield, we simply do what we are told.

2. If they didn't specify anything, try to get the number of processors on this node. Note that we now get the number of local procs in our job that are sharing this node - that comes in through the proc callback and is stored in the ompi_proc_t structures.

3. If we can get the number of processors, compare that to the number of local procs from my job that are sharing my node. If the number of local procs exceeds the number of processors, set sched_yield to true. If not, then be a hog and set sched_yield to false.

4. If we can't get the number of processors, default to conservative behavior and set sched_yield to true (a condensed sketch of this sequence follows the list).
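For reference, a condensed sketch of that decision sequence as it now runs in ompi_mpi_init - this is only a summary of the first diff hunk below, with the same calls (mca_base_param_find/lookup, opal_paffinity_base_get_num_processors, opal_progress_set_yield_when_idle) and the new -1 "unset" default for the yield_when_idle parameter:

/* sketch only - condensed from the ompi_mpi_init hunk below */
int param, value, num_processors;

param = mca_base_param_find("mpi", NULL, "yield_when_idle");
mca_base_param_lookup_int(param, &value);

if (value < 0) {
    /* user said nothing (parameter now defaults to -1) - decide for ourselves */
    if (OPAL_SUCCESS == opal_paffinity_base_get_num_processors(&num_processors)) {
        if (ompi_proc_local_proc->num_local_procs > num_processors) {
            opal_progress_set_yield_when_idle(true);   /* oversubscribed - yield */
        } else {
            opal_progress_set_yield_when_idle(false);  /* room to spare - be a hog */
        }
    } else {
        opal_progress_set_yield_when_idle(true);       /* unknown - be conservative */
    }
} else {
    /* user explicitly set 0 or 1 - do what we are told */
    opal_progress_set_yield_when_idle(value == 0 ? false : true);
}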

Note that I have not yet dealt with the need to dynamically adjust this setting as more processes are added via comm_spawn. So far, we are *only* looking within our own job. Given that we have now moved this logic to mpi_init (and away from the orteds), it isn't yet clear to me how a process will be informed about the number of procs in *other* jobs that are also sharing this node.

Something to continue to ponder.

This commit was SVN r13430.
This commit is contained in:
Ralph Castain 2007-02-01 19:31:44 +00:00
parent 79ea6d471b
commit 3daf8b341b
6 changed files with 36 additions and 134 deletions

View file

@@ -212,6 +212,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
bool timing = false;
int param, value;
struct timeval ompistart, ompistop;
int num_processors;
/* Join the run-time environment - do the things that don't hit
the registry */
@@ -670,15 +671,31 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
that code. */
opal_progress_event_users_decrement();
/* Finish tuning the progress engine to run the way the user would
like us to run. At this point, just adjust whether yield is
called when no events were processed in the progress engine. */
/* see if the user specified yield_when_idle - if so, use it */
param = mca_base_param_find("mpi", NULL, "yield_when_idle");
mca_base_param_lookup_int(param, &value);
if (value < 0) {
/* if we got a bogus value, do the conservative thing... */
opal_progress_set_yield_when_idle(true);
/* nope - so let's figure out what we can/should do...
* first, get the number of processors - if we can't then
* we can't do anything but set conservative values
*/
if (OPAL_SUCCESS == opal_paffinity_base_get_num_processors(&num_processors)) {
/* got the num_processors - compare that to the number of
* local procs in this job to decide if we are oversubscribed
*/
if (ompi_proc_local_proc->num_local_procs > num_processors) {
/* oversubscribed - better yield */
opal_progress_set_yield_when_idle(true);
} else {
/* not oversubscribed - go ahead and be a hog! */
opal_progress_set_yield_when_idle(false);
}
} else {
/* couldn't get num_processors - be conservative */
opal_progress_set_yield_when_idle(true);
}
} else {
/* yep, they specified it - so set idle accordingly */
opal_progress_set_yield_when_idle(value == 0 ? false : true);
}
param = mca_base_param_find("mpi", NULL, "event_tick_rate");

View file

@@ -85,7 +85,7 @@ int ompi_mpi_register_params(void)
exactly/under-subscribed, or 1 when oversubscribed */
mca_base_param_reg_int_name("mpi", "yield_when_idle",
"Yield the processor when waiting for MPI communication (for MPI processes, will default to 1 when oversubscribing nodes)",
false, false, 0, NULL);
false, false, -1, NULL);
mca_base_param_reg_int_name("mpi", "event_tick_rate",
"How often to progress TCP communications (0 = never, otherwise specified in microseconds)",
false, false, -1, NULL);

View file

@@ -119,17 +119,15 @@ int orte_odls_default_subscribe_launch_data(orte_jobid_t job, orte_gpr_notify_cb
char *glob_keys[] = {
ORTE_JOB_APP_CONTEXT_KEY,
ORTE_JOB_VPID_START_KEY,
ORTE_JOB_VPID_RANGE_KEY,
ORTE_JOB_OVERSUBSCRIBE_OVERRIDE_KEY
ORTE_JOB_VPID_RANGE_KEY
};
int num_glob_keys = 4;
int num_glob_keys = 3;
char* keys[] = {
ORTE_PROC_NAME_KEY,
ORTE_PROC_APP_CONTEXT_KEY,
ORTE_NODE_NAME_KEY,
ORTE_NODE_OVERSUBSCRIBED_KEY
ORTE_NODE_NAME_KEY
};
int num_keys = 4;
int num_keys = 3;
int i, rc;
/* get the job segment name */
@@ -298,15 +296,16 @@ int orte_odls_default_get_add_procs_data(orte_gpr_notify_data_t **data,
item = opal_list_get_next(item)) {
proc = (orte_mapped_proc_t*)item;
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, 0, segment, 3, 1))) {
/* cannot have tokens as we use that as a flag to indicate these
* values did not come from the globals container
*/
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&value, 0, segment, 3, 0))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
OBJ_RELEASE(value);
return rc;
}
value->tokens[0] = strdup("bogus"); /* must have at least one token */
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(value->keyvals[0]),
ORTE_PROC_NAME_KEY,
ORTE_NAME, &proc->name))) {
@@ -631,9 +630,6 @@ static int odls_default_fork_local_proc(
orte_odls_child_t *child,
orte_vpid_t vpid_start,
orte_vpid_t vpid_range,
bool want_processor,
size_t processor,
bool oversubscribed,
char **base_environ)
{
pid_t pid;
@@ -751,36 +747,6 @@ static int odls_default_fork_local_proc(
opal_unsetenv(param, &environ_copy);
free(param);
/* setup yield schedule and processor affinity
* We default here to always setting the affinity processor if we want
* it. The processor affinity system then determines
* if processor affinity is enabled/requested - if so, it then uses
* this value to select the process to which the proc is "assigned".
* Otherwise, the paffinity subsystem just ignores this value anyway
*/
if (oversubscribed) {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "1", false, &environ_copy);
} else {
param = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(param, "0", false, &environ_copy);
}
free(param);
if (want_processor) {
param = mca_base_param_environ_variable("mpi", NULL,
"paffinity_processor");
asprintf(&param2, "%lu", (unsigned long) processor);
opal_setenv(param, param2, false, &environ_copy);
free(param);
free(param2);
} else {
param = mca_base_param_environ_variable("mpi", NULL,
"paffinity_processor");
opal_unsetenv(param, &environ_copy);
free(param);
}
/* setup universe info */
if (NULL != orte_universe_info.name) {
param = mca_base_param_environ_variable("universe", NULL, NULL);
@@ -936,8 +902,6 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
opal_list_t app_context_list;
orte_odls_child_t *child;
odls_default_app_context_t *app_item;
int num_processors;
bool oversubscribed=false, want_processor, *bptr, override_oversubscribed=false;
opal_list_item_t *item, *item2;
/* parse the returned data to create the required structures
@@ -1017,16 +981,7 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
app_item->app_context = app;
opal_list_append(&app_context_list, &app_item->super);
kval->value->data = NULL; /* protect the data storage from later release */
}
if (strcmp(kval->key, ORTE_JOB_OVERSUBSCRIBE_OVERRIDE_KEY) == 0) {
/* this can only occur once, so just store it */
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
override_oversubscribed = *bptr;
continue;
}
}
} /* end for loop to process global data */
} else {
/* this must have come from one of the process containers, so it must
@@ -1065,14 +1020,6 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
child->app_idx = *sptr; /* save the index into the app_context objects */
continue;
}
if(strcmp(kval->key, ORTE_NODE_OVERSUBSCRIBED_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&bptr, kval->value, ORTE_BOOL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
oversubscribed = *bptr;
continue;
}
} /* kv2 */
/* protect operation on the global list of children */
OPAL_THREAD_LOCK(&orte_odls_default.mutex);
@@ -1087,53 +1034,6 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
} /* for j */
}
/* setup for processor affinity. If there are enough physical processors on this node, then
* we indicate which processor each process should be assigned to, IFF the user has requested
* processor affinity be used - the paffinity subsystem will make that final determination. All
* we do here is indicate that we should do the definitions just in case paffinity is active
*/
if (ORTE_SUCCESS != opal_paffinity_base_get_num_processors(&num_processors)) {
/* if we cannot find the number of local processors, then default to conservative
* settings
*/
want_processor = false; /* default to not being a hog */
/* leave oversubscribed alone */
opal_output(orte_odls_globals.output,
"odls: could not get number of processors - using conservative settings");
} else {
/* only do this if we can actually get info on the number of processors */
if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) {
want_processor = false;
} else {
want_processor = true;
}
/* now let's deal with the oversubscribed flag - and the use-case where a hostfile or some
* other non-guaranteed-accurate method was used to inform us about our allocation. Since
* the information on the number of slots on this node could have been incorrect, we need
* to check it against the local number of processors to ensure we don't overload them
*/
if (override_oversubscribed) {
opal_output(orte_odls_globals.output, "odls: overriding oversubscription");
if (opal_list_get_size(&orte_odls_default.children) > (size_t)num_processors) {
/* if the #procs > #processors, declare us oversubscribed regardless
* of what the mapper claimed - the user may have told us something
* incorrect
*/
oversubscribed = true;
} else {
/* likewise, if there are more processors here than we were told,
* declare us to not be oversubscribed so we can be aggressive. This
* covers the case where the user didn't tell us anything about the
* number of available slots, so we defaulted to a value of 1
*/
oversubscribed = false;
}
}
}
opal_output(orte_odls_globals.output, "odls: oversubscribed set to %s want_processor set to %s",
oversubscribed ? "true" : "false", want_processor ? "true" : "false");
/* okay, now let's launch our local procs using a fork/exec */
i = 0;
/* protect operations involving the global list of children */
@@ -1187,9 +1087,7 @@ DOFORK:
OPAL_THREAD_UNLOCK(&orte_odls_default.mutex);
if (ORTE_SUCCESS != (rc = odls_default_fork_local_proc(app, child, start,
range, want_processor,
i, oversubscribed,
base_environ))) {
range, base_environ))) {
ORTE_ERROR_LOG(rc);
orte_smr.set_proc_state(child->name, ORTE_PROC_STATE_ABORTED, 0);
opal_condition_signal(&orte_odls_default.cond);

View file

@@ -222,12 +222,7 @@ pls_xcpu_setup_env(char ***e)
free(param);
free(var);
#if 0
/* FixMe: do this only when we oversubscribe */
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "1", true, e);
free(var);
#endif
/* merge in environment */
env = opal_environ_merge(*e, environ);
opal_argv_free(*e);

View file

@@ -183,10 +183,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
&orted_globals.uri_pipe, OPAL_CMD_LINE_TYPE_INT,
"Report this process' uri on indicated pipe"},
{ NULL, NULL, NULL, '\0', NULL, "mpi-call-yield", 1,
&orted_globals.mpi_call_yield, OPAL_CMD_LINE_TYPE_INT,
"Have MPI (or similar) applications call yield when idle" },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@@ -407,11 +403,8 @@ int main(int argc, char *argv[])
/* check to see if I'm a bootproxy */
if (orted_globals.bootproxy) { /* perform bootproxy-specific things */
if (orted_globals.mpi_call_yield > 0) {
char *var;
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "1", true, &environ);
}
/* a daemon should *always* yield the processor when idle */
opal_progress_set_yield_when_idle(true);
/* attach a subscription to the orted standard trigger so I can get
* information on the processes I am to locally launch as soon as all

View file

@@ -48,7 +48,6 @@ typedef struct {
opal_condition_t condition;
bool exit_condition;
bool spin;
int mpi_call_yield;
int reap;
} orted_globals_t;