In r15007, the --bootproxy orted argument was removed to support daemon reuse.
The SnapC Full Local Coordinator used this argument to attach to the job that the daemon would be launching, so once the option was removed, checkpoint/restart (C/R) support broke. This commit has the Local Coordinator attach to the job just before it is launched by the ODLS module. This is a much cleaner solution and will eventually allow the SnapC modules to attach to multiple jobs launched on a single machine. This commit fixes the C/R regression introduced in r15007. This commit was SVN r15121. The following SVN revision numbers were found above: r15007 --> open-mpi/ompi@85df3bd92f
Этот коммит содержится в:
родитель
5719182a4e
Коммит
edb2cbd150
@ -92,6 +92,9 @@
|
||||
#include "orte/mca/smr/smr.h"
|
||||
#include "orte/mca/filem/filem.h"
|
||||
#include "orte/mca/filem/base/base.h"
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#endif
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/odls/default/odls_default.h"
|
||||
@ -963,7 +966,7 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
|
||||
}
|
||||
|
||||
opal_output(orte_odls_globals.output, "odls: setting up launch for job %ld", (long)job);
|
||||
|
||||
|
||||
/* We need to create a list of the app_contexts
|
||||
* so we can know what to launch - the process info only gives
|
||||
* us an index into the app_context array, not the app_context
|
||||
@ -1156,6 +1159,16 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data, char **ba
|
||||
free(uri_file);
|
||||
free(my_uri);
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/*
|
||||
* Notify the local SnapC component regarding new job
|
||||
*/
|
||||
if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(job) ) ) {
|
||||
/* Silent Failure :/ JJH */
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Now we preload any files that are needed. This is done on a per
|
||||
* app context basis */
|
||||
for (item = opal_list_get_first(&app_context_list);
|
||||
|
@ -76,6 +76,8 @@ extern "C" {
|
||||
*/
|
||||
int local_coord_init(void);
|
||||
int local_coord_finalize(void);
|
||||
int local_coord_setup_job(orte_jobid_t jobid);
|
||||
int local_coord_release_job(orte_jobid_t jobid);
|
||||
|
||||
/*
|
||||
* Application Coordinator Functionality
|
||||
|
@ -74,16 +74,31 @@ static orte_jobid_t snapc_local_jobid;
|
||||
* Function Definitions
|
||||
************************/
|
||||
int local_coord_init( void )
|
||||
{
|
||||
snapc_local_jobid = -1;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int local_coord_finalize( void )
|
||||
{
|
||||
if( snapc_local_jobid >= 0 ) {
|
||||
return local_coord_release_job(snapc_local_jobid);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int local_coord_setup_job(orte_jobid_t jobid)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
int id, jobid;
|
||||
|
||||
/*
|
||||
* Get the jobid that we are responsible for
|
||||
* Set the jobid that we are responsible for
|
||||
*/
|
||||
id = mca_base_param_register_int("rmgr","bootproxy","jobid",NULL,0);
|
||||
mca_base_param_lookup_int(id,&jobid);
|
||||
snapc_local_jobid = jobid;
|
||||
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
||||
"local) Monitor local jobid (%d)\n", snapc_local_jobid);
|
||||
|
||||
/*
|
||||
* Get the list of vpid's that we care about
|
||||
@ -118,13 +133,12 @@ int local_coord_init( void )
|
||||
}
|
||||
|
||||
ret = exit_status;
|
||||
goto cleanup;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
int local_coord_finalize( void )
|
||||
int local_coord_release_job(orte_jobid_t jobid)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_list_item_t* item = NULL;
|
||||
@ -161,9 +175,7 @@ int local_coord_finalize( void )
|
||||
OBJ_DESTRUCT(&snapc_local_vpids);
|
||||
|
||||
ret = exit_status;
|
||||
goto cleanup;
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
|
@ -196,6 +196,11 @@ int orte_snapc_full_setup_job(orte_jobid_t jobid) {
|
||||
exit_status = ret;
|
||||
}
|
||||
}
|
||||
else if( ORTE_SNAPC_LOCAL_COORD_TYPE == mca_snapc_full_component.super.coord_type ) {
|
||||
if(ORTE_SUCCESS != (ret = local_coord_setup_job(jobid) ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
@ -208,6 +213,11 @@ int orte_snapc_full_release_job(orte_jobid_t jobid) {
|
||||
exit_status = ret;
|
||||
}
|
||||
}
|
||||
else if( ORTE_SNAPC_LOCAL_COORD_TYPE == mca_snapc_full_component.super.coord_type ) {
|
||||
if(ORTE_SUCCESS != (ret = local_coord_release_job(jobid) ) ) {
|
||||
exit_status = ret;
|
||||
}
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user