/* * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017-2018 Cisco Systems, Inc. All rights reserved * Copyright (c) 2017 Inria. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #define OPAL_HWLOC_WANT_SHMEM 1 #include "orte_config.h" #include "orte/constants.h" #include "orte/types.h" #include #include #include #include #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #include #include #ifdef HAVE_SYS_STAT_H #include #endif #if HAVE_FCNTL_H #include #endif #include "opal/class/opal_list.h" #include "opal/dss/dss_types.h" #include "opal/mca/hwloc/hwloc-internal.h" #include "opal/mca/pmix/pmix_types.h" #include "opal/util/argv.h" #include "opal/util/fd.h" #include "opal/util/opal_environ.h" #include "opal/util/path.h" #include "orte/util/show_help.h" #include "orte/util/error_strings.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rtc/base/base.h" #include "rtc_hwloc.h" static int init(void); static void finalize(void); static void assign(orte_job_t *jdata); static void set(orte_job_t *jdata, orte_proc_t *proc, char ***environ_copy, int write_fd); orte_rtc_base_module_t orte_rtc_hwloc_module = { .init = init, .finalize = finalize, .assign = assign, .set = set }; #if HWLOC_API_VERSION >= 0x20000 static size_t shmemsize = 0; static size_t shmemaddr; static char *shmemfile = NULL; static int shmemfd = -1; static int parse_map_line(const char *line, unsigned long *beginp, unsigned long *endp, orte_rtc_hwloc_vm_map_kind_t *kindp); static int use_hole(unsigned long holebegin, unsigned long holesize, unsigned long *addrp, unsigned long size); static int find_hole(orte_rtc_hwloc_vm_hole_kind_t hkind, size_t *addrp, size_t size); static int enough_space(const char *filename, size_t space_req, uint64_t *space_avail, bool *result); #endif static int init(void) { #if HWLOC_API_VERSION >= 0x20000 int rc; bool space_available = false; uint64_t amount_space_avail = 0; /* ensure we have the topology */ if (OPAL_SUCCESS != (rc = opal_hwloc_base_get_topology())) { return rc; } if (VM_HOLE_NONE == mca_rtc_hwloc_component.kind) { return ORTE_SUCCESS; } /* get the size of the topology shared memory segment */ if (0 != hwloc_shmem_topology_get_length(opal_hwloc_topology, &shmemsize, 0)) { opal_output_verbose(2, orte_rtc_base_framework.framework_output, "%s hwloc topology shmem not available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); return ORTE_SUCCESS; } if (ORTE_SUCCESS != (rc = find_hole(mca_rtc_hwloc_component.kind, &shmemaddr, shmemsize))) { /* we couldn't find a hole, so don't use the shmem support */ if (4 < opal_output_get_verbosity(orte_rtc_base_framework.framework_output)) { FILE *file = fopen("/proc/self/maps", "r"); if (file) { char line[256]; opal_output(0, "%s Dumping /proc/self/maps", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); while (fgets(line, sizeof(line), file) != NULL) { char *end = strchr(line, '\n'); if (end) { *end = '\0'; } opal_output(0, "%s", line); } fclose(file); } } return ORTE_SUCCESS; } /* create the shmem file in our session dir so it * will automatically get cleaned up */ asprintf(&shmemfile, "%s/hwloc.sm", orte_process_info.jobfam_session_dir); /* let's make sure we have enough space for the backing file */ if (OPAL_SUCCESS != (rc = enough_space(shmemfile, shmemsize, &amount_space_avail, &space_available))) { opal_output_verbose(2, orte_rtc_base_framework.framework_output, "%s an error occurred while determining " "whether or not %s could be created for topo shmem.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), shmemfile); free(shmemfile); shmemfile = NULL; return ORTE_SUCCESS; } if (!space_available) { if (1 < opal_output_get_verbosity(orte_rtc_base_framework.framework_output)) { orte_show_help("help-orte-rtc-hwloc.txt", "target full", true, shmemfile, orte_process_info.nodename, (unsigned long)shmemsize, (unsigned long long)amount_space_avail); } free(shmemfile); shmemfile = NULL; return ORTE_SUCCESS; } /* enough space is available, so create the segment */ if (-1 == (shmemfd = open(shmemfile, O_CREAT | O_RDWR, 0600))) { int err = errno; if (1 < opal_output_get_verbosity(orte_rtc_base_framework.framework_output)) { orte_show_help("help-orte-rtc-hwloc.txt", "sys call fail", true, orte_process_info.nodename, "open(2)", "", strerror(err), err); } free(shmemfile); shmemfile = NULL; return ORTE_SUCCESS; } /* ensure nobody inherits this fd */ opal_fd_set_cloexec(shmemfd); /* populate the shmem segment with the topology */ if (0 != (rc = hwloc_shmem_topology_write(opal_hwloc_topology, shmemfd, 0, (void*)shmemaddr, shmemsize, 0))) { opal_output_verbose(2, orte_rtc_base_framework.framework_output, "%s an error occurred while writing topology to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), shmemfile); unlink(shmemfile); free(shmemfile); shmemfile = NULL; close(shmemfd); shmemfd = -1; return ORTE_SUCCESS; } #endif return ORTE_SUCCESS; } static void finalize(void) { #if HWLOC_API_VERSION >= 0x20000 if (NULL != shmemfile) { unlink(shmemfile); free(shmemfile); } if (0 <= shmemfd) { close(shmemfd); } #endif return; } static void assign(orte_job_t *jdata) { #if HWLOC_API_VERSION >= 0x20000 opal_list_t *cache; opal_value_t *kv; if (VM_HOLE_NONE == mca_rtc_hwloc_component.kind || NULL == shmemfile) { return; } /* add the shmem address and size to the job-level info that * will be provided to the proc upon registration */ cache = NULL; if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) || NULL == cache) { cache = OBJ_NEW(opal_list_t); orte_set_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, cache, OPAL_PTR); } opal_output_verbose(2, orte_rtc_base_framework.framework_output, "FILE %s ADDR %lx SIZE %lx", shmemfile, (unsigned long)shmemaddr, (unsigned long)shmemsize); kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_HWLOC_SHMEM_FILE); kv->type = OPAL_STRING; kv->data.string = strdup(shmemfile); opal_list_append(cache, &kv->super); kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_HWLOC_SHMEM_ADDR); kv->type = OPAL_SIZE; kv->data.size = shmemaddr; opal_list_append(cache, &kv->super); kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_HWLOC_SHMEM_SIZE); kv->type = OPAL_SIZE; kv->data.size = shmemsize; opal_list_append(cache, &kv->super); #endif } static void set(orte_job_t *jobdat, orte_proc_t *child, char ***environ_copy, int write_fd) { hwloc_cpuset_t cpuset; hwloc_obj_t root; opal_hwloc_topo_data_t *sum; orte_app_context_t *context; int rc=ORTE_ERROR; char *msg, *param; char *cpu_bitmap; opal_output_verbose(2, orte_rtc_base_framework.framework_output, "%s hwloc:set on child %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == child) ? "NULL" : ORTE_NAME_PRINT(&child->name)); if (NULL == jobdat || NULL == child) { /* nothing for us to do */ opal_output_verbose(2, orte_rtc_base_framework.framework_output, "%s hwloc:set jobdat %s child %s - nothing to do", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == jobdat) ? "NULL" : ORTE_JOBID_PRINT(jobdat->jobid), (NULL == child) ? "NULL" : ORTE_NAME_PRINT(&child->name)); return; } context = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx); /* Set process affinity, if given */ cpu_bitmap = NULL; if (!orte_get_attribute(&child->attributes, ORTE_PROC_CPU_BITMAP, (void**)&cpu_bitmap, OPAL_STRING) || NULL == cpu_bitmap || 0 == strlen(cpu_bitmap)) { /* if the daemon is bound, then we need to "free" this proc */ if (NULL != orte_daemon_cores) { root = hwloc_get_root_obj(opal_hwloc_topology); if (NULL == root->userdata) { orte_rtc_base_send_warn_show_help(write_fd, "help-orte-odls-default.txt", "incorrectly bound", orte_process_info.nodename, context->app, __FILE__, __LINE__); } sum = (opal_hwloc_topo_data_t*)root->userdata; /* bind this proc to all available processors */ rc = hwloc_set_cpubind(opal_hwloc_topology, sum->available, 0); /* if we got an error and this wasn't a default binding policy, then report it */ if (rc < 0 && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { if (errno == ENOSYS) { msg = "hwloc indicates cpu binding not supported"; } else if (errno == EXDEV) { msg = "hwloc indicates cpu binding cannot be enforced"; } else { char *tmp; (void)hwloc_bitmap_list_asprintf(&tmp, sum->available); asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"", opal_strerror(rc), tmp); free(tmp); } if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). */ orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", "binding generic error", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); } else { orte_rtc_base_send_warn_show_help(write_fd, "help-orte-odls-default.txt", "not bound", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); return; } } } if (0 == rc && opal_hwloc_report_bindings) { opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", child->name.vpid); /* avoid reporting it twice */ (void) mca_base_var_env_name ("hwloc_base_report_bindings", ¶m); opal_unsetenv(param, environ_copy); free(param); } } else { /* convert the list to a cpuset */ cpuset = hwloc_bitmap_alloc(); if (0 != (rc = hwloc_bitmap_list_sscanf(cpuset, cpu_bitmap))) { /* See comment above about "This may be a small memory leak" */ asprintf(&msg, "hwloc_bitmap_sscanf returned \"%s\" for the string \"%s\"", opal_strerror(rc), cpu_bitmap); if (NULL == msg) { msg = "failed to convert bitmap list to hwloc bitmap"; } if (OPAL_BINDING_REQUIRED(jobdat->map->binding) && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { /* If binding is required and a binding directive was explicitly * given (i.e., we are not binding due to a default policy), * send an error up the pipe (which exits -- it doesn't return). */ orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", "binding generic error", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); } else { orte_rtc_base_send_warn_show_help(write_fd, "help-orte-odls-default.txt", "not bound", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); free(cpu_bitmap); return; } } /* bind as specified */ rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0); /* if we got an error and this wasn't a default binding policy, then report it */ if (rc < 0 && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { char *tmp = NULL; if (errno == ENOSYS) { msg = "hwloc indicates cpu binding not supported"; } else if (errno == EXDEV) { msg = "hwloc indicates cpu binding cannot be enforced"; } else { asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"", opal_strerror(rc), cpu_bitmap); } if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). */ orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", "binding generic error", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); } else { orte_rtc_base_send_warn_show_help(write_fd, "help-orte-odls-default.txt", "not bound", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); if (NULL != tmp) { free(tmp); free(msg); } return; } if (NULL != tmp) { free(tmp); free(msg); } } if (0 == rc && opal_hwloc_report_bindings) { char tmp1[1024], tmp2[1024]; hwloc_cpuset_t mycpus; /* get the cpus we are bound to */ mycpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS) < 0) { opal_output(0, "MCW rank %d is not bound", child->name.vpid); } else { if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), opal_hwloc_topology, mycpus)) { opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", child->name.vpid); } else { opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), opal_hwloc_topology, mycpus); opal_output(0, "MCW rank %d bound to %s: %s", child->name.vpid, tmp1, tmp2); } } hwloc_bitmap_free(mycpus); /* avoid reporting it twice */ (void) mca_base_var_env_name ("hwloc_base_report_bindings", ¶m); opal_unsetenv(param, environ_copy); free(param); } /* set memory affinity policy - if we get an error, don't report * anything unless the user actually specified the binding policy */ rc = opal_hwloc_base_set_process_membind_policy(); if (ORTE_SUCCESS != rc && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { if (errno == ENOSYS) { msg = "hwloc indicates memory binding not supported"; } else if (errno == EXDEV) { msg = "hwloc indicates memory binding cannot be enforced"; } else { msg = "failed to bind memory"; } if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { /* If binding is required, send an error up the pipe (which exits -- it doesn't return). */ orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", "memory binding error", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); } else { orte_rtc_base_send_warn_show_help(write_fd, "help-orte-odls-default.txt", "memory not bound", orte_process_info.nodename, context->app, msg, __FILE__, __LINE__); free(cpu_bitmap); return; } } } if (NULL != cpu_bitmap) { free(cpu_bitmap); } } #if HWLOC_API_VERSION >= 0x20000 static int parse_map_line(const char *line, unsigned long *beginp, unsigned long *endp, orte_rtc_hwloc_vm_map_kind_t *kindp) { const char *tmp = line, *next; unsigned long value; /* "beginaddr-endaddr " */ value = strtoull(tmp, (char **) &next, 16); if (next == tmp) { return ORTE_ERROR; } *beginp = (unsigned long) value; if (*next != '-') { return ORTE_ERROR; } tmp = next + 1; value = strtoull(tmp, (char **) &next, 16); if (next == tmp) { return ORTE_ERROR; } *endp = (unsigned long) value; tmp = next; if (*next != ' ') { return ORTE_ERROR; } tmp = next + 1; /* look for ending absolute path */ next = strchr(tmp, '/'); if (next) { *kindp = VM_MAP_FILE; } else { /* look for ending special tag [foo] */ next = strchr(tmp, '['); if (next) { if (!strncmp(next, "[heap]", 6)) { *kindp = VM_MAP_HEAP; } else if (!strncmp(next, "[stack]", 7)) { *kindp = VM_MAP_STACK; } else { char *end; if ((end = strchr(next, '\n')) != NULL) { *end = '\0'; } opal_output_verbose(80, orte_rtc_base_framework.framework_output, "Found special VMA \"%s\" before stack", next); *kindp = VM_MAP_OTHER; } } else { *kindp = VM_MAP_ANONYMOUS; } } return ORTE_SUCCESS; } #define ALIGN2MB (2*1024*1024UL) static int use_hole(unsigned long holebegin, unsigned long holesize, unsigned long *addrp, unsigned long size) { unsigned long aligned; unsigned long middle = holebegin+holesize/2; opal_output_verbose(80, orte_rtc_base_framework.framework_output, "looking in hole [0x%lx-0x%lx] size %lu (%lu MB) for %lu (%lu MB)\n", holebegin, holebegin+holesize, holesize, holesize>>20, size, size>>20); if (holesize < size) { return ORTE_ERROR; } /* try to align the middle of the hole on 64MB for POWER's 64k-page PMD */ #define ALIGN64MB (64*1024*1024UL) aligned = (middle + ALIGN64MB) & ~(ALIGN64MB-1); if (aligned + size <= holebegin + holesize) { opal_output_verbose(80, orte_rtc_base_framework.framework_output, "aligned [0x%lx-0x%lx] (middle 0x%lx) to 0x%lx for 64MB\n", holebegin, holebegin+holesize, middle, aligned); opal_output_verbose(80, orte_rtc_base_framework.framework_output, " there are %lu MB free before and %lu MB free after\n", (aligned-holebegin)>>20, (holebegin+holesize-aligned-size)>>20); *addrp = aligned; return ORTE_SUCCESS; } /* try to align the middle of the hole on 2MB for x86 PMD */ aligned = (middle + ALIGN2MB) & ~(ALIGN2MB-1); if (aligned + size <= holebegin + holesize) { opal_output_verbose(80, orte_rtc_base_framework.framework_output, "aligned [0x%lx-0x%lx] (middle 0x%lx) to 0x%lx for 2MB\n", holebegin, holebegin+holesize, middle, aligned); opal_output_verbose(80, orte_rtc_base_framework.framework_output, " there are %lu MB free before and %lu MB free after\n", (aligned-holebegin)>>20, (holebegin+holesize-aligned-size)>>20); *addrp = aligned; return ORTE_SUCCESS; } /* just use the end of the hole */ *addrp = holebegin + holesize - size; opal_output_verbose(80, orte_rtc_base_framework.framework_output, "using the end of hole starting at 0x%lx\n", *addrp); opal_output_verbose(80, orte_rtc_base_framework.framework_output, " there are %lu MB free before\n", (*addrp-holebegin)>>20); return ORTE_SUCCESS; } static int find_hole(orte_rtc_hwloc_vm_hole_kind_t hkind, size_t *addrp, size_t size) { unsigned long biggestbegin = 0; unsigned long biggestsize = 0; unsigned long prevend = 0; orte_rtc_hwloc_vm_map_kind_t prevmkind = VM_MAP_OTHER; int in_libs = 0; FILE *file; char line[96]; file = fopen("/proc/self/maps", "r"); if (!file) { return ORTE_ERROR; } while (fgets(line, sizeof(line), file) != NULL) { unsigned long begin=0, end=0; orte_rtc_hwloc_vm_map_kind_t mkind=VM_MAP_OTHER; if (!parse_map_line(line, &begin, &end, &mkind)) { opal_output_verbose(90, orte_rtc_base_framework.framework_output, "found %s from 0x%lx to 0x%lx\n", mkind == VM_MAP_HEAP ? "HEAP" : mkind == VM_MAP_STACK ? "STACK" : mkind == VM_MAP_OTHER ? "OTHER" : mkind == VM_MAP_FILE ? "FILE" : mkind == VM_MAP_ANONYMOUS ? "ANON" : "unknown", begin, end); switch (hkind) { case VM_HOLE_BEGIN: fclose(file); return use_hole(0, begin, addrp, size); case VM_HOLE_AFTER_HEAP: if (prevmkind == VM_MAP_HEAP && mkind != VM_MAP_HEAP) { /* only use HEAP when there's no other HEAP after it * (there can be several of them consecutively). */ fclose(file); return use_hole(prevend, begin-prevend, addrp, size); } break; case VM_HOLE_BEFORE_STACK: if (mkind == VM_MAP_STACK) { fclose(file); return use_hole(prevend, begin-prevend, addrp, size); } break; case VM_HOLE_IN_LIBS: /* see if we are between heap and stack */ if (prevmkind == VM_MAP_HEAP) { in_libs = 1; } if (mkind == VM_MAP_STACK) { in_libs = 0; } if (!in_libs) { /* we're not in libs, ignore this entry */ break; } /* we're in libs, consider this entry for searching the biggest hole below */ /* fallthrough */ case VM_HOLE_BIGGEST: if (begin-prevend > biggestsize) { opal_output_verbose(90, orte_rtc_base_framework.framework_output, "new biggest 0x%lx - 0x%lx = %lu (%lu MB)\n", prevend, begin, begin-prevend, (begin-prevend)>>20); biggestbegin = prevend; biggestsize = begin-prevend; } break; default: assert(0); } } while (!strchr(line, '\n')) { if (!fgets(line, sizeof(line), file)) { goto done; } } if (mkind == VM_MAP_STACK) { /* Don't go beyond the stack. Other VMAs are special (vsyscall, vvar, vdso, etc), * There's no spare room there. And vsyscall is even above the userspace limit. */ break; } prevend = end; prevmkind = mkind; } done: fclose(file); if (hkind == VM_HOLE_IN_LIBS || hkind == VM_HOLE_BIGGEST) { return use_hole(biggestbegin, biggestsize, addrp, size); } return ORTE_ERROR; } static int enough_space(const char *filename, size_t space_req, uint64_t *space_avail, bool *result) { uint64_t avail = 0; size_t fluff = (size_t)(.05 * space_req); bool enough = false; char *last_sep = NULL; /* the target file name is passed here, but we need to check the parent * directory. store it so we can extract that info later. */ char *target_dir = strdup(filename); int rc; if (NULL == target_dir) { rc = OPAL_ERR_OUT_OF_RESOURCE; goto out; } /* get the parent directory */ last_sep = strrchr(target_dir, OPAL_PATH_SEP[0]); *last_sep = '\0'; /* now check space availability */ if (OPAL_SUCCESS != (rc = opal_path_df(target_dir, &avail))) { OPAL_OUTPUT_VERBOSE( (70, orte_rtc_base_framework.framework_output, "WARNING: opal_path_df failure!") ); goto out; } /* do we have enough space? */ if (avail >= space_req + fluff) { enough = true; } else { OPAL_OUTPUT_VERBOSE( (70, orte_rtc_base_framework.framework_output, "WARNING: not enough space on %s to meet request!" "available: %"PRIu64 "requested: %lu", target_dir, avail, (unsigned long)space_req + fluff) ); } out: if (NULL != target_dir) { free(target_dir); } *result = enough; *space_avail = avail; return rc; } #endif