Use unique collective ids for the checkpoint/restart code
author Adrian Reber <adrian.reber@hs-esslingen.de>
Mon, 3 Feb 2014 19:35:18 +0000 (20:35 +0100)
committer Adrian Reber <adrian.reber@hs-esslingen.de>
Mon, 3 Feb 2014 19:40:17 +0000 (20:40 +0100)
orte/mca/ess/env/ess_env_module.c
orte/mca/odls/base/odls_base_default_fns.c
orte/mca/plm/base/plm_base_launch_support.c
orte/mca/snapc/full/snapc_full_app.c
orte/orted/orted_main.c
orte/runtime/orte_globals.h
orte/util/proc_info.c
orte/util/proc_info.h

index 9b80099..b04f902 100644 (file)
@@ -277,7 +277,7 @@ static int rte_ft_event(int state)
     orte_grpcomm_collective_t coll;
 
     OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
-    coll.id = orte_process_info.peer_init_barrier;
+    coll.id = orte_process_info.snapc_init_barrier;
 
     /******** Checkpoint Prep ********/
     if(OPAL_CRS_CHECKPOINT == state) {
index cdc2874..a9db7e3 100644 (file)
@@ -596,7 +596,21 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
     nm->name.jobid = jdata->jobid;
     nm->name.vpid = ORTE_VPID_WILDCARD;
     opal_list_append(&coll->participants, &nm->super);
-    
+
+#if OPAL_ENABLE_FT_CR == 1
+    coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
+    nm = OBJ_NEW(orte_namelist_t);
+    nm->name.jobid = jdata->jobid;
+    nm->name.vpid = ORTE_VPID_WILDCARD;
+    opal_list_append(&coll->participants, &nm->super);
+
+    coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
+    nm = OBJ_NEW(orte_namelist_t);
+    nm->name.jobid = jdata->jobid;
+    nm->name.vpid = ORTE_VPID_WILDCARD;
+    opal_list_append(&coll->participants, &nm->super);
+#endif
+
     /* progress any pending collectives */
     orte_grpcomm_base_progress_collectives();
     
index 702f7c4..0e90a9a 100644 (file)
@@ -238,6 +238,10 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
     char *modx_par, *modx_val;
     char *bar1_par, *bar1_val;
     char *bar2_par, *bar2_val;
+#if OPAL_ENABLE_FT_CR == 1
+    char *barcr1_par, *barcr1_val;
+    char *barcr2_par, *barcr2_val;
+#endif
 
     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                          "%s plm:base:setup_job",
@@ -283,6 +287,16 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
     (void) mca_base_var_env_name ("orte_peer_fini_barrier_id", &bar2_par);
     asprintf(&bar2_val, "%d", caddy->jdata->peer_fini_barrier);
 
+#if OPAL_ENABLE_FT_CR == 1
+    caddy->jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id();
+    printf("snapc_init_barrier %d\n", caddy->jdata->snapc_init_barrier);
+    (void) mca_base_var_env_name("orte_snapc_init_barrier_id", &barcr1_par);
+    asprintf(&barcr1_val, "%d", caddy->jdata->snapc_init_barrier);
+    caddy->jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id();
+    (void) mca_base_var_env_name("orte_snapc_fini_barrier_id", &barcr2_par);
+    asprintf(&barcr2_val, "%d", caddy->jdata->snapc_fini_barrier);
+#endif
+
     /* if app recovery is not defined, set apps to defaults */
     for (i=0; i < caddy->jdata->apps->size; i++) {
         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
@@ -295,6 +309,10 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
         opal_setenv(modx_par, modx_val, true, &app->env);
         opal_setenv(bar1_par, bar1_val, true, &app->env);
         opal_setenv(bar2_par, bar2_val, true, &app->env);
+#if OPAL_ENABLE_FT_CR == 1
+        opal_setenv(barcr1_par, barcr1_val, true, &app->env);
+        opal_setenv(barcr2_par, barcr2_val, true, &app->env);
+#endif
     }
     free(modx_par);
     free(modx_val);
@@ -302,6 +320,12 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
     free(bar1_val);
     free(bar2_par);
     free(bar2_val);
+#if OPAL_ENABLE_FT_CR == 1
+    free(barcr1_par);
+    free(barcr1_val);
+    free(barcr2_par);
+    free(barcr2_val);
+#endif
 
     /* set the job state to the next position */
     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_INIT_COMPLETE);
index 68bde03..dc83987 100644 (file)
@@ -155,7 +155,7 @@ int app_coord_init()
     }
 
     coll = OBJ_NEW(orte_grpcomm_collective_t);
-    coll->id = orte_process_info.peer_init_barrier;
+    coll->id = orte_process_info.snapc_init_barrier;
     if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
            ORTE_ERROR_LOG(ret);
         exit_status = ret;
@@ -231,7 +231,7 @@ int app_coord_finalize()
     }
 
     coll = OBJ_NEW(orte_grpcomm_collective_t);
-    coll->id = orte_process_info.peer_init_barrier;
+    coll->id = orte_process_info.snapc_init_barrier;
     if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
         ORTE_ERROR_LOG(ret);
         exit_status = ret;
@@ -309,7 +309,7 @@ int app_coord_finalize()
                              "app) Shutdown Barrier: Waiting on barrier...!"));
     }
 
-    coll->id = orte_process_info.peer_fini_barrier;
+    coll->id = orte_process_info.snapc_fini_barrier;
     if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
         ORTE_ERROR_LOG(ret);
         exit_status = ret;
index 285d70f..9ad0f8d 100644 (file)
@@ -606,6 +606,22 @@ int orte_daemon(int argc, char *argv[])
         nm->name.vpid = ORTE_VPID_WILDCARD;
         opal_list_append(&coll->participants, &nm->super);
 
+#if OPAL_ENABLE_FT_CR == 1
+        jdata->snapc_init_barrier = orte_grpcomm_base_get_coll_id();
+        coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
+        nm = OBJ_NEW(orte_namelist_t);
+        nm->name.jobid = jdata->jobid;
+        nm->name.vpid = ORTE_VPID_WILDCARD;
+        opal_list_append(&coll->participants, &nm->super);
+
+        jdata->snapc_fini_barrier = orte_grpcomm_base_get_coll_id();
+        coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
+        nm = OBJ_NEW(orte_namelist_t);
+        nm->name.jobid = jdata->jobid;
+        nm->name.vpid = ORTE_VPID_WILDCARD;
+        opal_list_append(&coll->participants, &nm->super);
+#endif
+
         /* need to setup a pidmap for it */
         if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(&orte_pidmap, false))) {
             ORTE_ERROR_LOG(ret);
index 1f2cd18..679c3f8 100644 (file)
@@ -462,6 +462,9 @@ typedef struct {
     char *ckpt_snapshot_ref;
     /* snapshot location */
     char *ckpt_snapshot_loc;
+    /* collective ids */
+    orte_grpcomm_coll_id_t snapc_init_barrier;
+    orte_grpcomm_coll_id_t snapc_fini_barrier;
 #endif
 } orte_job_t;
 ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_t);
index 54fe4d6..dc83d21 100644 (file)
@@ -83,6 +83,10 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
     /*  .peer_init_barrier =            */   -1,
     /*  .peer_fini_barrier =            */   -1,
     /*  .my_hostid =                    */   ORTE_VPID_INVALID
+#if OPAL_ENABLE_FT_CR == 1
+    /*  .snapc_init_barrier =           */   -1,
+    /*  .snapc_fini_barrier =           */   -1,
+#endif
 };
 
 static bool init=false;
@@ -90,6 +94,10 @@ static int orte_ess_node_rank;
 static int orte_peer_modex_id;
 static int orte_peer_init_barrier_id;
 static int orte_peer_fini_barrier_id;
+#if OPAL_ENABLE_FT_CR == 1
+static int orte_snapc_init_barrier_id;
+static int orte_snapc_fini_barrier_id;
+#endif
 static char *orte_strip_prefix;
 
 int orte_proc_info(void)
@@ -286,6 +294,26 @@ int orte_proc_info(void)
                                   &orte_peer_fini_barrier_id);
     orte_process_info.peer_fini_barrier = (orte_grpcomm_coll_id_t) orte_peer_fini_barrier_id;
 
+#if OPAL_ENABLE_FT_CR == 1
+    orte_snapc_init_barrier_id = -1;
+    (void) mca_base_var_register ("orte", "orte", NULL, "snapc_init_barrier_id", "SNAPC init barrier collective id",
+                                  MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                  MCA_BASE_VAR_FLAG_INTERNAL,
+                                  OPAL_INFO_LVL_9,
+                                  MCA_BASE_VAR_SCOPE_CONSTANT,
+                                  &orte_snapc_init_barrier_id);
+    orte_process_info.snapc_init_barrier = (orte_grpcomm_coll_id_t) orte_snapc_init_barrier_id;
+
+    orte_snapc_fini_barrier_id = -1;
+    (void) mca_base_var_register ("orte", "orte", NULL, "snapc_fini_barrier_id", "SNAPC finalize barrier collective id",
+                                  MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                  MCA_BASE_VAR_FLAG_INTERNAL,
+                                  OPAL_INFO_LVL_9,
+                                  MCA_BASE_VAR_SCOPE_CONSTANT,
+                                  &orte_snapc_fini_barrier_id);
+    orte_process_info.snapc_fini_barrier = (orte_grpcomm_coll_id_t) orte_snapc_fini_barrier_id;
+#endif
+
     return ORTE_SUCCESS;
 }
 
index 765ced8..8795a1f 100644 (file)
@@ -130,6 +130,10 @@ struct orte_proc_info_t {
     orte_grpcomm_coll_id_t peer_init_barrier;   /**< barrier id during init */
     orte_grpcomm_coll_id_t peer_fini_barrier;   /**< barrier id during finalize */
     orte_vpid_t my_hostid;               /** identifies the local host for a coprocessor */
+#if OPAL_ENABLE_FT_CR == 1
+    orte_grpcomm_coll_id_t snapc_init_barrier;  /**< barrier id during init */
+    orte_grpcomm_coll_id_t snapc_fini_barrier;  /**< barrier id during finalize */
+#endif
 };
 typedef struct orte_proc_info_t orte_proc_info_t;