From 8d1be27a1e98940e21e87b75251f6996668490bc Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 25 Aug 2018 07:45:28 -0700 Subject: [PATCH] Deal with special case during cleanup In some scenarios, we can have a daemon sharing the node with mpirun. In those cases, we need to avoid race conditions in cleanup Signed-off-by: Ralph Castain --- orte/util/session_dir.c | 54 +++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/orte/util/session_dir.c b/orte/util/session_dir.c index 90f464fefb..657cec6586 100644 --- a/orte/util/session_dir.c +++ b/orte/util/session_dir.c @@ -12,7 +12,7 @@ * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2018 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -61,6 +61,7 @@ #include "orte/util/show_help.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ras/base/base.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" @@ -370,6 +371,16 @@ cleanup: int orte_session_dir_cleanup(orte_jobid_t jobid) { + /* special case - if a daemon is colocated with mpirun, + * then we let mpirun do the rest to avoid a race + * condition. this scenario always results in the rank=1 + * daemon colocated with mpirun */ + if (orte_ras_base.launch_orted_on_hn && + ORTE_PROC_IS_DAEMON && + 1 == ORTE_PROC_MY_NAME->vpid) { + return ORTE_SUCCESS; + } + if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) { /* we haven't created them or RM will clean them up for us*/ return ORTE_SUCCESS; @@ -386,6 +397,7 @@ orte_session_dir_cleanup(orte_jobid_t jobid) return ORTE_ERR_NOT_INITIALIZED; } + /* recursively blow the whole session away for our job family, * saving only output files */ @@ -461,20 +473,6 @@ orte_session_dir_finalize(orte_process_name_t *proc) opal_os_dirpath_destroy(orte_process_info.proc_session_dir, false, orte_dir_check_file); - opal_os_dirpath_destroy(orte_process_info.job_session_dir, - false, orte_dir_check_file); - /* only remove the jobfam session dir if we are the - * local daemon and we are finalizing our own session dir */ - if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) && - (ORTE_PROC_MY_NAME == proc)) { - opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir, - false, orte_dir_check_file); - } - - if( NULL != orte_process_info.top_session_dir ){ - opal_os_dirpath_destroy(orte_process_info.top_session_dir, - false, orte_dir_check_file); - } if (opal_os_dirpath_is_empty(orte_process_info.proc_session_dir)) { if (orte_debug_flag) { @@ -492,6 +490,32 @@ orte_session_dir_finalize(orte_process_name_t *proc) } } + /* special case - if a daemon is colocated with mpirun, + * then we let mpirun do the rest to avoid a race + * condition. this scenario always results in the rank=1 + * daemon colocated with mpirun */ + if (orte_ras_base.launch_orted_on_hn && + ORTE_PROC_IS_DAEMON && + 1 == ORTE_PROC_MY_NAME->vpid) { + return ORTE_SUCCESS; + } + + opal_os_dirpath_destroy(orte_process_info.job_session_dir, + false, orte_dir_check_file); + + /* only remove the jobfam session dir if we are the + * local daemon and we are finalizing our own session dir */ + if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) && + (ORTE_PROC_MY_NAME == proc)) { + opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir, + false, orte_dir_check_file); + } + + if( NULL != orte_process_info.top_session_dir ){ + opal_os_dirpath_destroy(orte_process_info.top_session_dir, + false, orte_dir_check_file); + } + if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) { if (orte_debug_flag) { opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");