Makes BERT large with sequence length 128 the default
pranavm-nvidia committed Sep 18, 2019
1 parent 3105d84 commit 3209d63
Showing 5 changed files with 22 additions and 18 deletions.
19 changes: 10 additions & 9 deletions demo/BERT/python/BERT_TRT.ipynb
@@ -80,7 +80,7 @@
"source": [
"### 3.a Paragraph and Queries\n",
"\n",
"The paragraph and the questions can be customized by changing the text below:"
"The paragraph and the questions can be customized by changing the text below. Note that when using models with small sequence lengths, you should use a shorter paragraph:"
]
},
{
@@ -96,7 +96,8 @@
"metadata": {},
"outputs": [],
"source": [
"paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\""
"paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"\n",
"short_paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon.\""
]
},
{
@@ -164,8 +165,7 @@
"import data_processing as dp\n",
"import tokenization\n",
"\n",
"#Base\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt\", do_lower_case=True)\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt\", do_lower_case=True)\n",
"\n",
"# The maximum number of tokens for the question. Questions longer than this will be truncated to this length.\n",
"max_query_length = 64\n",
@@ -175,10 +175,10 @@
"\n",
"# The maximum total input sequence length after WordPiece tokenization. \n",
"# Sequences longer than this will be truncated, and sequences shorter \n",
"max_seq_length = 384\n",
"max_seq_length = 128\n",
"\n",
"# Extract tokecs from the paragraph\n",
"doc_tokens = dp.convert_doc_tokens(paragraph_text)\n",
"# Extract tokens from the paragraph\n",
"doc_tokens = dp.convert_doc_tokens(short_paragraph_text)\n",
"\n",
"# Extract features from the paragraph and question\n",
"features = dp.convert_examples_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length)\n"
@@ -225,8 +225,8 @@
"import numpy as np\n",
"import time\n",
"\n",
"# Load the Base BERT Engine\n",
"with open(\"/workspace/TensorRT/demo/BERT/bert_base_384.engine\", \"rb\") as f, \\\n",
"# Load the BERT-Large Engine\n",
"with open(\"/workspace/TensorRT/demo/BERT/bert_large_128.engine\", \"rb\") as f, \\\n",
" trt.Runtime(TRT_LOGGER) as runtime, \\\n",
" runtime.deserialize_cuda_engine(f.read()) as engine, \\\n",
" engine.create_execution_context() as context:\n",
@@ -267,6 +267,7 @@
"\n",
" eval_time_elapsed = time.time() - eval_start_time\n",
" \n",
" \n",
" print(\"-----------------------------\")\n",
" print(\"Running Inference in {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n",
" print(\"-----------------------------\")\n",
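Why the shorter paragraph matters: with a sequence length of 128, the question and the paragraph must together fit in 128 WordPiece tokens (less the special tokens), otherwise the paragraph is split into overlapping windows of `doc_stride` tokens. A minimal sketch of checking the fit, reusing the notebook's `tokenizer`, `question_text`, and `short_paragraph_text` (illustrative only, not part of this commit):
```
# Rough budget check: tokens left for the paragraph after the question
# and the [CLS]/[SEP] special tokens are accounted for.
max_seq_length = 128
doc_budget = max_seq_length - len(tokenizer.tokenize(question_text)) - 3

n_doc_tokens = len(tokenizer.tokenize(short_paragraph_text))
if n_doc_tokens > doc_budget:
    print("Paragraph needs {} tokens but only {} fit; it will be split into "
          "overlapping windows of doc_stride tokens.".format(n_doc_tokens, doc_budget))
```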
6 changes: 3 additions & 3 deletions demo/BERT/python/README.md
@@ -20,17 +20,17 @@
## Building an Engine
To build an engine, run the `bert_builder.py` script. For example,
```
-python python/bert_builder.py -m /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/model.ckpt-8144 -o bert_base_384.engine -b 1 -s 384 -c /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2
+python python/bert_builder.py -m /workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/model.ckpt-8144 -o bert_large_128.engine -b 1 -s 128 -c /workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2
```
-This will build and engine with a maximum batch size of 1 (`-b 1`), and sequence length of 384 (`-s 384`) using the `bert_config.json` file located in `workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2`
+This will build an engine with a maximum batch size of 1 (`-b 1`) and a sequence length of 128 (`-s 128`), using the `bert_config.json` file located in `/workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2`.
## Running Inference
### Using the Python Script
You can run inference with the engine generated from the previous step using the `bert_inference.py` script.
This script accepts a passage and a question. For example,
```
-python python/bert_inference.py -e bert_base_384.engine -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt
+python python/bert_inference.py -e bert_large_128.engine -s 128 -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_large_fp16_128_v2/vocab.txt
```
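The `-s` value should match the sequence length the engine was built with. If you built the base, 384-length engine instead, a matching invocation would look like this (hypothetical, assuming that engine and vocab file exist at these paths):
```
python python/bert_inference.py -e bert_base_384.engine -s 384 -p "..." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt
```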
### Using the Jupyter Notebook
6 changes: 3 additions & 3 deletions demo/BERT/python/bert_builder.py
@@ -382,9 +382,9 @@ def set_profile_shape(profile, batch_size):
parser = argparse.ArgumentParser(description='TensorRT BERT Sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--model', required=True,
help='The checkpoint file basename, e.g.: basename(model.ckpt-766908.data-00000-of-00001) is model.ckpt-766908')
-parser.add_argument('-o', '--output', required=True, default="bert_base_384.engine", help='The bert engine file, ex bert.engine')
+parser.add_argument('-o', '--output', required=True, default="bert_large_128.engine", help='The bert engine file, e.g. bert.engine')
parser.add_argument('-b', '--batchsize', default=1, help='Batch size')
-parser.add_argument('-s', '--sequence', default=384, help='Sequence length of the BERT model')
+parser.add_argument('-s', '--sequence-length', default=384, help='Sequence length of the BERT model')
parser.add_argument('-c', '--config', required=True,
help='The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google')

@@ -393,6 +393,6 @@ def set_profile_shape(profile, batch_size):
inputbase = opt.model
outputbase = opt.output
B = int(opt.batchsize)
-S = int(opt.sequence)
+S = int(opt.sequence_length)
bert_path = opt.config
main(inputbase, B, S, bert_path, outputbase)
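A note on the flag rename: argparse derives the namespace attribute from the long option by replacing hyphens with underscores, which is why `--sequence-length` is read back as `opt.sequence_length`. A standalone sketch of that behavior (illustrative, not part of this commit):
```
import argparse

parser = argparse.ArgumentParser()
# '--sequence-length' becomes the attribute 'sequence_length' on the namespace.
parser.add_argument('-s', '--sequence-length', default=384, help='Sequence length of the BERT model')
opt = parser.parse_args(['--sequence-length', '128'])
S = int(opt.sequence_length)  # the builder script converts to int the same way
print(S)  # 128
```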
5 changes: 4 additions & 1 deletion demo/BERT/python/bert_inference.py
@@ -47,6 +47,9 @@ def parse_args():
parser.add_argument('-v', '--vocab-file',
help='Path to file containing entire understandable vocab',
default='./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt')
+parser.add_argument('-s', '--sequence-length',
+                    help='The sequence length to use. Defaults to 128',
+                    default=128, type=int)
args, _ = parser.parse_known_args()
return args

@@ -77,7 +80,7 @@ def parse_args():
doc_stride = 128
# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter
# than this will be padded.
-max_seq_length = 384
+max_seq_length = args.sequence_length
# Extract tokens from the paragraph
doc_tokens = dp.convert_doc_tokens(paragraph_text)

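Because `max_seq_length` now comes from `--sequence-length`, passing a value that differs from the engine's build-time sequence length would produce mismatched input shapes. A hypothetical guard (not part of this commit; assumes the sequence dimension appears in the shape of input binding 0):
```
def check_sequence_length(engine, seq_len):
    # engine is a tensorrt.ICudaEngine; binding 0 is assumed to be an input
    # whose shape includes the sequence length the engine was built with.
    shape = tuple(engine.get_binding_shape(0))
    if seq_len not in shape:
        raise ValueError("Engine input shape {} does not match requested "
                         "sequence length {}".format(shape, seq_len))
```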
4 changes: 2 additions & 2 deletions demo/BERT/python/build_examples.sh
@@ -16,9 +16,9 @@


# Setup default parameters (if no command-line parameters given)
-MODEL='base'
+MODEL='large'
FT_PRECISION='fp16'
-SEQ_LEN='384'
+SEQ_LEN='128'

SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname ${SCRIPT})
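For reference, the "defaults unless given" behavior described by the comment above is commonly written with positional-parameter fallbacks; a hypothetical sketch of that pattern (the script's actual argument handling may differ):
```
# Hypothetical: take MODEL, FT_PRECISION, and SEQ_LEN from positional
# arguments when provided, otherwise keep the defaults.
MODEL=${1:-large}
FT_PRECISION=${2:-fp16}
SEQ_LEN=${3:-128}
echo "Building BERT-${MODEL} (${FT_PRECISION}) at sequence length ${SEQ_LEN}"
```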
