1
1

Deal with special case during cleanup

In some scenarios, a daemon can share a node with mpirun. In those
cases, the daemon must leave session directory cleanup to mpirun to
avoid a race condition.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-08-25 07:45:28 -07:00
родитель a0ea197e97
Коммит 8d1be27a1e

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -61,6 +61,7 @@
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ras/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
@ -370,6 +371,16 @@ cleanup:
int
orte_session_dir_cleanup(orte_jobid_t jobid)
{
/* Special case: if a daemon is colocated with mpirun,
 * let mpirun do the cleanup to avoid a race condition.
 * In this scenario, the colocated daemon is always the
 * one with vpid 1. */
if (orte_ras_base.launch_orted_on_hn &&
ORTE_PROC_IS_DAEMON &&
1 == ORTE_PROC_MY_NAME->vpid) {
return ORTE_SUCCESS;
}
if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) {
/* we haven't created them or RM will clean them up for us*/
return ORTE_SUCCESS;
@ -386,6 +397,7 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
return ORTE_ERR_NOT_INITIALIZED;
}
/* recursively blow the whole session away for our job family,
* saving only output files
*/
@ -461,20 +473,6 @@ orte_session_dir_finalize(orte_process_name_t *proc)
opal_os_dirpath_destroy(orte_process_info.proc_session_dir,
false, orte_dir_check_file);
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
false, orte_dir_check_file);
/* only remove the jobfam session dir if we are the
* local daemon and we are finalizing our own session dir */
if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) &&
(ORTE_PROC_MY_NAME == proc)) {
opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir,
false, orte_dir_check_file);
}
if( NULL != orte_process_info.top_session_dir ){
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
false, orte_dir_check_file);
}
if (opal_os_dirpath_is_empty(orte_process_info.proc_session_dir)) {
if (orte_debug_flag) {
@ -492,6 +490,32 @@ orte_session_dir_finalize(orte_process_name_t *proc)
}
}
/* Special case: if a daemon is colocated with mpirun,
 * let mpirun do the rest of the cleanup to avoid a race
 * condition. In this scenario, the colocated daemon is
 * always the one with vpid 1. */
if (orte_ras_base.launch_orted_on_hn &&
ORTE_PROC_IS_DAEMON &&
1 == ORTE_PROC_MY_NAME->vpid) {
return ORTE_SUCCESS;
}
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
false, orte_dir_check_file);
/* only remove the jobfam session dir if we are the
* local daemon and we are finalizing our own session dir */
if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) &&
(ORTE_PROC_MY_NAME == proc)) {
opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir,
false, orte_dir_check_file);
}
if( NULL != orte_process_info.top_session_dir ){
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
false, orte_dir_check_file);
}
if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) {
if (orte_debug_flag) {
opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");