Makes BERT large with sequence length 128 the default
pranavm-nvidia committed Sep 18, 2019
1 parent 3105d84 commit 3209d63
Showing 5 changed files with 22 additions and 18 deletions.
19 changes: 10 additions & 9 deletions demo/BERT/python/BERT_TRT.ipynb
@@ -80,7 +80,7 @@
"source": [
"### 3.a Paragraph and Queries\n",
"\n",
"The paragraph and the questions can be customized by changing the text below:"
"The paragraph and the questions can be customized by changing the text below. Note that when using models with small sequence lengths, you should use a shorter paragraph:"
]
},
{
@@ -96,7 +96,8 @@
"metadata": {},
"outputs": [],
"source": [
"paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\""
"paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"\n",
"short_paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon.\""
]
},
{
@@ -164,8 +165,7 @@
"import data_processing as dp\n",
"import tokenization\n",
"\n",
"#Base\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt\", do_lower_case=True)\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt\", do_lower_case=True)\n",
"\n",
"# The maximum number of tokens for the question. Questions longer than this will be truncated to this length.\n",
"max_query_length = 64\n",
@@ -175,10 +175,10 @@
"\n",
"# The maximum total input sequence length after WordPiece tokenization. \n",
"# Sequences longer than this will be truncated, and sequences shorter \n",
"max_seq_length = 384\n",
"max_seq_length = 128\n",
"\n",
"# Extract tokecs from the paragraph\n",
"doc_tokens = dp.convert_doc_tokens(paragraph_text)\n",
"# Extract tokens from the paragraph\n",
"doc_tokens = dp.convert_doc_tokens(short_paragraph_text)\n",
"\n",
"# Extract features from the paragraph and question\n",
"features = dp.convert_examples_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length)\n"
@@ -225,8 +225,8 @@
"import numpy as np\n",
"import time\n",
"\n",
"# Load the Base BERT Engine\n",
"with open(\"/workspace/TensorRT/demo/BERT/bert_base_384.engine\", \"rb\") as f, \\\n",
"# Load the BERT-Large Engine\n",
"with open(\"/workspace/TensorRT/demo/BERT/bert_large_128.engine\", \"rb\") as f, \\\n",
" trt.Runtime(TRT_LOGGER) as runtime, \\\n",
" runtime.deserialize_cuda_engine(f.read()) as engine, \\\n",
" engine.create_execution_context() as context:\n",
@@ -267,6 +267,7 @@
"\n",
" eval_time_elapsed = time.time() - eval_start_time\n",
" \n",
" \n",
" print(\"-----------------------------\")\n",
" print(\"Running Inference in {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n",
" print(\"-----------------------------\")\n",
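Why the shorter paragraph matters: with a sequence length of 128, the question and the paragraph must together fit in 128 WordPiece tokens (less the special tokens), otherwise the paragraph is split into overlapping windows of `doc_stride` tokens. A minimal sketch of checking the fit, reusing the notebook's `tokenizer`, `question_text`, and `short_paragraph_text` (illustrative only, not part of this commit):
```
# Rough budget check: tokens left for the paragraph after the question
# and the [CLS]/[SEP] special tokens are accounted for.
max_seq_length = 128
doc_budget = max_seq_length - len(tokenizer.tokenize(question_text)) - 3

n_doc_tokens = len(tokenizer.tokenize(short_paragraph_text))
if n_doc_tokens > doc_budget:
    print("Paragraph needs {} tokens but only {} fit; it will be split into "
          "overlapping windows of doc_stride tokens.".format(n_doc_tokens, doc_budget))
```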
6 changes: 3 additions & 3 deletions demo/BERT/python/README.md
@@ -20,17 +20,17 @@
## Building an Engine
To build an engine, run the `bert_builder.py` script. For example,
```
-python python/bert_builder.py -m /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/model.ckpt-8144 -o bert_base_384.engine -b 1 -s 384 -c /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2
+python python/bert_builder.py -m /workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/model.ckpt-8144 -o bert_large_128.engine -b 1 -s 128 -c /workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2
```
-This will build and engine with a maximum batch size of 1 (`-b 1`), and sequence length of 384 (`-s 384`) using the `bert_config.json` file located in `workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2`
+This will build an engine with a maximum batch size of 1 (`-b 1`) and a sequence length of 128 (`-s 128`), using the `bert_config.json` file located in `/workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2`.
## Running Inference
### Using the Python Script
You can run inference with the engine generated from the previous step using the `bert_inference.py` script.
This script accepts a passage and a question. For example,
```
-python python/bert_inference.py -e bert_base_384.engine -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt
+python python/bert_inference.py -e bert_large_128.engine -s 128 -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt
```
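The `-s` value should match the sequence length the engine was built with. If you built the base, 384-length engine instead, a matching invocation would look like this (hypothetical, assuming that engine and vocab file exist at these paths):
```
python python/bert_inference.py -e bert_base_384.engine -s 384 -p "..." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt
```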
### Using the Jupyter Notebook
6 changes: 3 additions & 3 deletions demo/BERT/python/bert_builder.py
@@ -382,9 +382,9 @@ def set_profile_shape(profile, batch_size):
parser = argparse.ArgumentParser(description='TensorRT BERT Sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--model', required=True,
help='The checkpoint file basename, e.g.: basename(model.ckpt-766908.data-00000-of-00001) is model.ckpt-766908')
-parser.add_argument('-o', '--output', required=True, default="bert_base_384.engine", help='The bert engine file, ex bert.engine')
+parser.add_argument('-o', '--output', required=True, default="bert_large_128.engine", help='The bert engine file, e.g. bert.engine')
parser.add_argument('-b', '--batchsize', default=1, help='Batch size')
-parser.add_argument('-s', '--sequence', default=384, help='Sequence length of the BERT model')
+parser.add_argument('-s', '--sequence-length', default=384, help='Sequence length of the BERT model')
parser.add_argument('-c', '--config', required=True,
help='The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google')

@@ -393,6 +393,6 @@ def set_profile_shape(profile, batch_size):
inputbase = opt.model
outputbase = opt.output
B = int(opt.batchsize)
-S = int(opt.sequence)
+S = int(opt.sequence_length)
bert_path = opt.config
main(inputbase, B, S, bert_path, outputbase)
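A note on the flag rename: argparse derives the namespace attribute from the long option by replacing hyphens with underscores, which is why `--sequence-length` is read back as `opt.sequence_length`. A standalone sketch of that behavior (illustrative, not part of this commit):
```
import argparse

parser = argparse.ArgumentParser()
# '--sequence-length' becomes the attribute 'sequence_length' on the namespace.
parser.add_argument('-s', '--sequence-length', default=384, help='Sequence length of the BERT model')
opt = parser.parse_args(['--sequence-length', '128'])
S = int(opt.sequence_length)  # the builder script converts to int the same way
print(S)  # 128
```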
5 changes: 4 additions & 1 deletion demo/BERT/python/bert_inference.py
@@ -47,6 +47,9 @@ def parse_args():
parser.add_argument('-v', '--vocab-file',
help='Path to file containing entire understandable vocab',
default='./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt')
+parser.add_argument('-s', '--sequence-length',
+                    help='The sequence length to use. Defaults to 128',
+                    default=128, type=int)
args, _ = parser.parse_known_args()
return args

@@ -77,7 +80,7 @@ def parse_args():
doc_stride = 128
# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter
# than this will be padded.
-max_seq_length = 384
+max_seq_length = args.sequence_length
# Extract tokens from the paragraph
doc_tokens = dp.convert_doc_tokens(paragraph_text)

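Because `max_seq_length` now comes from `--sequence-length`, passing a value that differs from the engine's build-time sequence length would produce mismatched input shapes. A hypothetical guard (not part of this commit; assumes the sequence dimension appears in the shape of input binding 0):
```
def check_sequence_length(engine, seq_len):
    # engine is a tensorrt.ICudaEngine; binding 0 is assumed to be an input
    # whose shape includes the sequence length the engine was built with.
    shape = tuple(engine.get_binding_shape(0))
    if seq_len not in shape:
        raise ValueError("Engine input shape {} does not match requested "
                         "sequence length {}".format(shape, seq_len))
```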
4 changes: 2 additions & 2 deletions demo/BERT/python/build_examples.sh
@@ -16,9 +16,9 @@


# Setup default parameters (if no command-line parameters given)
-MODEL='base'
+MODEL='large'
FT_PRECISION='fp16'
-SEQ_LEN='384'
+SEQ_LEN='128'

SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname ${SCRIPT})
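For reference, the "defaults unless given" behavior described by the comment above is commonly written with positional-parameter fallbacks; a hypothetical sketch of that pattern (the script's actual argument handling may differ):
```
# Hypothetical: take MODEL, FT_PRECISION, and SEQ_LEN from positional
# arguments when provided, otherwise keep the defaults.
MODEL=${1:-large}
FT_PRECISION=${2:-fp16}
SEQ_LEN=${3:-128}
echo "Building BERT-${MODEL} (${FT_PRECISION}) at sequence length ${SEQ_LEN}"
```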
