Skip to content

Commit

Permalink
Supporting evaluation for multi gpu training case (NVIDIA#1250)
Browse files Browse the repository at this point in the history
* Supporting evaluation for multi gpu training case
  • Loading branch information
vladgets authored Oct 3, 2020
1 parent da4bf19 commit 7042e41
Showing 1 changed file with 21 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,25 +38,36 @@ def main(cfg: DictConfig) -> None:
model.save_to(cfg.model.nemo_path)
logging.info(f'The model is saved into the `.nemo` file: {cfg.model.nemo_path}')

# after model training is done, if you have saved the checkpoints, you can create the model from
# the checkpoint again and evaluate it on a data file.
# after model training is done, you can load the model from the saved checkpoint
# and evaluate it on a data file.
logging.info("================================================================================================")
logging.info("Starting the testing of the trained model on test set...")
logging.info("We will load the latest model saved checkpoint from the training...")

# retrieve the path to the last checkpoint of the training
# the latest checkpoint would be used, change to 'best.ckpt' to use the best one instead
# retrieve the path to the last checkpoint of the training (you can use the best checkpoint instead)
checkpoint_path = os.path.join(
trainer.checkpoint_callback.dirpath, trainer.checkpoint_callback.prefix + "end.ckpt"
)
eval_model = IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=checkpoint_path)

# we will set up testing data reusing the same config (test section)
eval_model.setup_test_data(test_data_config=cfg.model.test_ds)
if os.path.exists(checkpoint_path):
eval_model = IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=checkpoint_path)

trainer.test(model=model, ckpt_path=None, verbose=False)
logging.info("Testing finished!")
logging.info("================================================================================================")
# we will set up the testing data reusing the same config (test section)
eval_model.setup_test_data(test_data_config=cfg.model.test_ds)

# we will reinitialize the trainer with a single GPU and no distributed backend for the evaluation
# the next line is also required to work around a trainer re-initialization issue in PyTorch Lightning
os.environ.pop('PL_TRAINER_GPUS')
cfg.trainer.gpus = 1 if cfg.trainer.gpus != 0 else 0
cfg.trainer.distributed_backend = None
eval_trainer = pl.Trainer(**cfg.trainer)

eval_trainer.test(model=eval_model, ckpt_path=None, verbose=False)

logging.info("Testing finished!")
logging.info(
"================================================================================================"
)


if __name__ == '__main__':
Expand Down

0 comments on commit 7042e41

Please sign in to comment.