Deal with special case during cleanup
In some scenarios, we can have a daemon sharing the node with mpirun. In those cases, we need to avoid race conditions in cleanup Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
a0ea197e97
Коммит
8d1be27a1e
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -61,6 +61,7 @@
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ras/base/base.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
@ -370,6 +371,16 @@ cleanup:
|
||||
int
|
||||
orte_session_dir_cleanup(orte_jobid_t jobid)
|
||||
{
|
||||
/* special case - if a daemon is colocated with mpirun,
|
||||
* then we let mpirun do the rest to avoid a race
|
||||
* condition. this scenario always results in the rank=1
|
||||
* daemon colocated with mpirun */
|
||||
if (orte_ras_base.launch_orted_on_hn &&
|
||||
ORTE_PROC_IS_DAEMON &&
|
||||
1 == ORTE_PROC_MY_NAME->vpid) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) {
|
||||
/* we haven't created them or RM will clean them up for us*/
|
||||
return ORTE_SUCCESS;
|
||||
@ -386,6 +397,7 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
|
||||
return ORTE_ERR_NOT_INITIALIZED;
|
||||
}
|
||||
|
||||
|
||||
/* recursively blow the whole session away for our job family,
|
||||
* saving only output files
|
||||
*/
|
||||
@ -461,20 +473,6 @@ orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
|
||||
opal_os_dirpath_destroy(orte_process_info.proc_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
/* only remove the jobfam session dir if we are the
|
||||
* local daemon and we are finalizing our own session dir */
|
||||
if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) &&
|
||||
(ORTE_PROC_MY_NAME == proc)) {
|
||||
opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
}
|
||||
|
||||
if( NULL != orte_process_info.top_session_dir ){
|
||||
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
}
|
||||
|
||||
if (opal_os_dirpath_is_empty(orte_process_info.proc_session_dir)) {
|
||||
if (orte_debug_flag) {
|
||||
@ -492,6 +490,32 @@ orte_session_dir_finalize(orte_process_name_t *proc)
|
||||
}
|
||||
}
|
||||
|
||||
/* special case - if a daemon is colocated with mpirun,
|
||||
* then we let mpirun do the rest to avoid a race
|
||||
* condition. this scenario always results in the rank=1
|
||||
* daemon colocated with mpirun */
|
||||
if (orte_ras_base.launch_orted_on_hn &&
|
||||
ORTE_PROC_IS_DAEMON &&
|
||||
1 == ORTE_PROC_MY_NAME->vpid) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
|
||||
/* only remove the jobfam session dir if we are the
|
||||
* local daemon and we are finalizing our own session dir */
|
||||
if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) &&
|
||||
(ORTE_PROC_MY_NAME == proc)) {
|
||||
opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
}
|
||||
|
||||
if( NULL != orte_process_info.top_session_dir ){
|
||||
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
|
||||
false, orte_dir_check_file);
|
||||
}
|
||||
|
||||
if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user