diff --git a/.gitmodules b/.gitmodules
index 46fc6586..3fd45b59 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "parsers/onnx"]
 	path = parsers/onnx
 	url = https://github.com/onnx/onnx-tensorrt.git
-	branch = 5.1
+	branch = 6.0
 [submodule "third_party/protobuf"]
 	path = third_party/protobuf
 	url = https://github.com/protocolbuffers/protobuf.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f2b8d11..ec7ca17d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,7 +21,7 @@ include(cmake/modules/find_library_create_target.cmake)
 set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR})
 set_ifndef(TRT_BIN_DIR ${CMAKE_BINARY_DIR})
 
-file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/NvInfer.h" VERSION_STRINGS REGEX "#define NV_TENSORRT_.*")
+file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/NvInferVersion.h" VERSION_STRINGS REGEX "#define NV_TENSORRT_.*")
 
 foreach(TYPE MAJOR MINOR PATCH BUILD)
     string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING ${VERSION_STRINGS})
@@ -37,15 +37,14 @@ set(TRT_VERSION "${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}.${TRT_BUILD}" CACHE STRI
 set(TRT_SOVERSION "${TRT_SO_MAJOR}.${TRT_SO_MINOR}.${TRT_SO_PATCH}" CACHE STRING "TRT library so version")
 message("Building for TensorRT version: ${TRT_VERSION}, library version: ${TRT_SOVERSION}")
 
-set(FIND_CUDA "")
 if(NOT DEFINED CMAKE_TOOLCHAIN_FILE)
     find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++)
-    set(FIND_CUDA "CUDA")
 endif()
 
-message("CHECK for ${FIND_CUDA}")
+set(CMAKE_SKIP_BUILD_RPATH True)
+
 project(TensorRT
-        LANGUAGES CXX ${FIND_CUDA}
+        LANGUAGES CXX CUDA
         VERSION ${TRT_VERSION}
         DESCRIPTION "TensorRT is a C++ library that facilitates high performance inference on NVIDIA GPUs and deep learning accelerators."
         HOMEPAGE_URL "https://github.com/NVIDIA/TensorRT")
@@ -78,6 +77,8 @@ endif()
 
 set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss")
 
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-deprecated-declarations")
+
 ################################### DEPENDENCIES ##########################################
 set(DEFAULT_CUDA_VERSION 10.1)
 set(DEFAULT_CUDNN_VERSION 7.5)
@@ -151,15 +152,18 @@ else()
     set(CUB_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/cub CACHE STRING "directory of CUB installation")
 endif()
 
-find_package(CUDA ${CUDA_VERSION} REQUIRED)
+## find_package(CUDA) is broken for cross-compilation. Enable CUDA language instead.
+if(NOT DEFINED CMAKE_TOOLCHAIN_FILE)
+    find_package(CUDA ${CUDA_VERSION} REQUIRED)
+endif()
 
 include_directories(
     ${CUDA_INCLUDE_DIRS}
 )
 find_library(CUDNN_LIB cudnn HINTS
-    ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDNN_ROOT_DIR}/lib64)
+    ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib64 lib)
 find_library(CUBLAS_LIB cublas HINTS
-    ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+    ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs)
 
 if(BUILD_PARSERS)
     configure_protobuf(${PROTOBUF_VERSION})
@@ -173,8 +177,10 @@ if (NOT (NVINTERNAL OR NVPARTNER))
     find_library_create_target(nvuffparser nvparsers SHARED ${TRT_LIB_DIR})
 endif()
 
-find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64)
 find_library(RT_LIB rt)
+
+set(CUDA_LIBRARIES ${CUDART_LIB})
 
 ############################################################################################
 # TensorRT
diff --git a/README.md b/README.md
index e502d1c7..8a13f270 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,9 @@ To build the TensorRT OSS components, ensure you meet the following package requ
 * [CUDA](https://developer.nvidia.com/cuda-toolkit)
   * Recommended versions:
-  * [cuda-10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) + cuDNN-7.5
-  * [cuda-10.0](https://developer.nvidia.com/cuda-10.0-download-archive) + cuDNN-7.5
-  * [cuda-9.0](https://developer.nvidia.com/cuda-90-download-archive) + cuDNN 7.3
+  * [cuda-10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) + cuDNN-7.6
+  * [cuda-10.0](https://developer.nvidia.com/cuda-10.0-download-archive) + cuDNN-7.6
+  * [cuda-9.0](https://developer.nvidia.com/cuda-90-download-archive) + cuDNN 7.6
 
 * [GNU Make](https://ftp.gnu.org/gnu/make/) >= v4.1
 
@@ -45,12 +45,11 @@ To build the TensorRT OSS components, ensure you meet the following package requ
 
 **TensorRT Release**
 
-* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-5x-download) v5.1.5
-
+* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v6.0.1
 
 NOTE: Along with the TensorRT OSS components, the following source packages will also be downloaded, and they are not required to be installed on the system.
 
-- [ONNX-TensorRT](https://github.com/onnx/onnx-tensorrt) v5.1
+- [ONNX-TensorRT](https://github.com/onnx/onnx-tensorrt) v6.0
 - [CUB](http://nvlabs.github.io/cub/) v1.8.0
 - [Protobuf](https://github.com/protocolbuffers/protobuf.git) v3.8.x
 
@@ -60,7 +59,7 @@ NOTE: Along with the TensorRT OSS components, the following source packages will
 
 1. #### Download TensorRT OSS sources.
 
 	```bash
-	git clone -b release/5.1 https://github.com/nvidia/TensorRT TensorRT
+	git clone -b master https://github.com/nvidia/TensorRT TensorRT
 	cd TensorRT
 	git submodule update --init --recursive
 	export TRT_SOURCE=`pwd`
@@ -68,26 +67,26 @@ NOTE: Along with the TensorRT OSS components, the following source packages will
 
 2. #### Download the TensorRT binary release.
 
-	To build the TensorRT OSS, obtain the corresponding TensorRT 5.1.5 binary release from [NVidia Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-5x-download). For a list of key features, known and fixed issues, see the [TensorRT 5.1.5 Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/tensorrt-5.html#rel_5-1-5).
+	To build the TensorRT OSS, obtain the corresponding TensorRT 6.0.1 binary release from [NVidia Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-download). For a list of key features, known and fixed issues, see the [TensorRT 6.0.1 Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/index.html).
 
 	**Example: Ubuntu 18.04 with cuda-10.1**
 
-	Download and extract the *TensorRT 5.1.5.0 GA for Ubuntu 18.04 and CUDA 10.1 tar package*
+	Download and extract the *TensorRT 6.0.1.5 GA for Ubuntu 18.04 and CUDA 10.1 tar package*
 	```bash
 	cd ~/Downloads
-	# Download TensorRT-5.1.5.0.Ubuntu-18.04.2.x86_64-gnu.cuda-10.1.cudnn7.5.tar.gz
-	tar -xvzf TensorRT-5.1.5.0.Ubuntu-18.04.2.x86_64-gnu.cuda-10.1.cudnn7.5.tar.gz
-	export TRT_RELEASE=`pwd`/TensorRT-5.1.5.0
+	# Download TensorRT-6.0.1.5.Ubuntu-18.04.2.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz
+	tar -xvzf TensorRT-6.0.1.5.Ubuntu-18.04.2.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz
+	export TRT_RELEASE=`pwd`/TensorRT-6.0.1.5
 	```
 
 	**Example: CentOS/RedHat 7 with cuda-9.0**
 
-	Download and extract the *TensorRT 5.1.5.0 GA for CentOS/RedHat 7 and CUDA 9.0 tar package*
+	Download and extract the *TensorRT 6.0.1.5 GA for CentOS/RedHat 7 and CUDA 9.0 tar package*
 	```bash
 	cd ~/Downloads
-	# Download TensorRT-5.1.5.0.Red-Hat.x86_64-gnu.cuda-9.0.cudnn7.5.tar.gz
-	tar -xvzf TensorRT-5.1.5.0.Red-Hat.x86_64-gnu.cuda-9.0.cudnn7.5.tar.gz
-	export TRT_RELEASE=~/Downloads/TensorRT-5.1.5.0
+	# Download TensorRT-6.0.1.5.Red-Hat.x86_64-gnu.cuda-9.0.cudnn7.6.tar.gz
+	tar -xvzf TensorRT-6.0.1.5.Red-Hat.x86_64-gnu.cuda-9.0.cudnn7.6.tar.gz
+	export TRT_RELEASE=~/Downloads/TensorRT-6.0.1.5
 	```
 
 ## Setting Up The Build Environment
@@ -134,20 +133,9 @@ NOTE: Along with the TensorRT OSS components, the following source packages will
 	> NOTE:
 	> 1. The default CUDA version used by CMake is 10.1. To override this, for example to 9.0, append `-DCUDA_VERSION=9.0` to the cmake command.
-	> 2. If linking against the plugin and parser libraries obtained from TensorRT release (default behavior) is causing compatibility issues with TensorRT OSS, try building the OSS components separately in the following dependency order:
+	> 2. Samples may fail to link on CentOS7. To work around this, create the following symbolic link:
 	> ```bash
-	> # 1. Build Plugins
-	> cmake .. -DTRT_LIB_DIR=$TRT_RELEASE/lib -DTRT_BIN_DIR=`pwd`/out \
-	>   -DBUILD_PLUGINS=ON -DBUILD_PARSERS=OFF -DBUILD_SAMPLES=OFF
-	> make -j$(nproc)
-	> # 2. Build Parsers
-	> cmake .. -DTRT_LIB_DIR=$TRT_RELEASE/lib -DTRT_BIN_DIR=`pwd`/out \
-	>   -DBUILD_PLUGINS=OFF -DBUILD_PARSERS=ON -DBUILD_SAMPLES=OFF
-	> make -j$(nproc)
-	> # 3. Build Samples
-	> cmake .. -DTRT_LIB_DIR=$TRT_RELEASE/lib -DTRT_BIN_DIR=`pwd`/out \
-	>   -DBUILD_PLUGINS=OFF -DBUILD_PARSERS=OFF -DBUILD_SAMPLES=ON
-	> make -j$(nproc)
+	> ln -s $TRT_BIN_DIR/libnvinfer_plugin.so $TRT_BIN_DIR/libnvinfer_plugin.so.6
 	> ```
 
 	The required CMake arguments are:
@@ -176,6 +164,10 @@ NOTE: Along with the TensorRT OSS components, the following source packages will
 
 	Other build options with limited applicability:
 
+	- `NVINTERNAL`: Used by the TensorRT team for internal builds. Values consist of [`OFF`] | `ON`.
+
+	- `PROTOBUF_INTERNAL_VERSION`: The version of protobuf to use, for example [`10.0`]. Only applicable if `NVINTERNAL` is also enabled.
+
 	- `NVPARTNER`: For use by NVIDIA partners with exclusive source access. Values consists of [`OFF`] | `ON`.
 
 	- `CUB_VERSION`: The version of CUB to use, for example [`1.8.0`].
 
@@ -191,6 +183,7 @@ NOTE: Along with the TensorRT OSS components, the following source packages will
 
 	* Copy the build artifacts into the TensorRT installation directory, updating the installation.
 		* TensorRT installation directory is determined as `$TRT_LIB_DIR/..`
 		* Installation might require superuser privileges depending on the path and permissions of files being replaced.
+		* Installation is not supported in the cross-compilation scenario. Please copy the resulting files from the `build/out` folder to the target device.
 
 	```bash
 	sudo make install
 	```
@@ -208,6 +201,5 @@ NOTE: Along with the TensorRT OSS components, the following source packages will
 
 ## Known Issues
 
-#### TensorRT 5.1.5
-* FP16/INT8 modes have been disabled in SampleSSD (Caffe version). Please see the [SampleSSD README](samples/opensource/sampleSSD/README.md#known-issues) for details.
-* Additionally, see the TensorRT [Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/tensorrt-5.html#rel_5-1-5).
+#### TensorRT 6.0.1
+* See [Release Notes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-release-notes/index.html).
diff --git a/VERSION b/VERSION
index 1b177239..8cd21902 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-5.1.5.0
+6.0.1.5
diff --git a/cmake/toolchains/cmake_aarch64.toolchain b/cmake/toolchains/cmake_aarch64.toolchain
index 499eb542..5262506e 100644
--- a/cmake/toolchains/cmake_aarch64.toolchain
+++ b/cmake/toolchains/cmake_aarch64.toolchain
@@ -19,11 +19,11 @@ set(CMAKE_SYSTEM_PROCESSOR aarch64)
 set(TRT_PLATFORM_ID "aarch64")
 set(CUDA_PLATFORM_ID "aarch64-linux")
 
-set(CMAKE_C_COMPILER $ENV{AARCH64_CC})
-set(CMAKE_CXX_COMPILER $ENV{AARCH64_CC})
+set(CMAKE_C_COMPILER /usr/bin/aarch64-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER /usr/bin/aarch64-linux-gnu-g++)
 
-set(CMAKE_C_FLAGS "$ENV{AARCH64_CFLAGS}" CACHE STRING "" FORCE)
-set(CMAKE_CXX_FLAGS "$ENV{AARCH64_CFLAGS}" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS "" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS "" CACHE STRING "" FORCE)
 
 set(CMAKE_C_COMPILER_TARGET aarch64)
 set(CMAKE_CXX_COMPILER_TARGET aarch64)
@@ -35,14 +35,16 @@ if(NVINTERNAL)
     set(EXT_PATH ${PROJECT_SOURCE_DIR}/../externals)
     set(CUDA_ROOT ${EXT_PATH}/cuda-${CUDA_VERSION}-${TRT_PLATFORM_ID}/${CUDA_PLATFORM_ID})
 else()
-    set(CUDA_ROOT /usr/local/cuda-${CUDA_VERSION}/targets/${CUDA_PLATFORM_ID})
+    set(CUDA_ROOT /usr/local/cuda-${CUDA_VERSION}/targets/${CUDA_PLATFORM_ID} CACHE STRING "CUDA ROOT dir")
 endif()
 
 set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
 set(CUDA_INCLUDE_DIRS ${CUDA_ROOT}/include)
 
+set(RT_LIB /usr/aarch64-linux-gnu/lib/librt.so)
+
 set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "" FORCE)
-set(CMAKE_CUDA_FLAGS "-I${CUDA_INCLUDE_DIRS} -Xcompiler=\"-fPIC ${CMAKE_CXX_FLAGS}\"" CACHE STRING "" FORCE)
+set(CMAKE_CUDA_FLAGS "-cudart none -I${CUDA_INCLUDE_DIRS} -Xcompiler=\"-fPIC ${CMAKE_CXX_FLAGS}\"" CACHE STRING "" FORCE)
 set(CMAKE_CUDA_COMPILER_FORCED TRUE)
 
 if(DEFINED ENV{VULCAN} AND NOT $ENV{VULCAN} STREQUAL "")
diff --git a/demo/BERT/CMakeLists.txt b/demo/BERT/CMakeLists.txt
index 60b29746..5c60fefa 100644
--- a/demo/BERT/CMakeLists.txt
+++ b/demo/BERT/CMakeLists.txt
@@ -20,62 +20,72 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
 --expt-extended-lambda \
 -gencode arch=compute_70,code=sm_70 \
 -gencode arch=compute_75,code=sm_75 \
--O3")
+-Wno-deprecated-declarations")
 
-set(BERT_LIBS 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
+
+set(BERT_LIBS
     cudart
     cublas
     nvinfer
+    nvinfer_plugin
     pthread
     z
-    )
+)
 
 include_directories(
-    ../../include
-    ../../samples/common
-    /usr/local/cuda-10.1/targets/x86_64-linux/include
     ./
-    ./plugins
+    ./bert
     ./layers
+    ./plugins
     ./util
+    ../../include/
+    ../../samples/common/
     ../../third_party/cub/
+
/usr/include/x86_64-linux-gnu + /usr/local/cuda-10.1/targets/x86_64-linux/include + /workspace/tensorrt/include + /workspace/tensorrt/samples/common /workspace/cub/ /workspace/cutlass/ - ) +) link_directories( + /usr/lib/x86_64-linux-gnu /usr/local/cuda-10.1/targets/x86_64-linux/lib - /tensorrt/lib - ) + /workspace/tensorrt/lib +) add_library(common SHARED ../../samples/common/logger.cpp - util/dataUtils.cpp - ) +) add_library(bert_plugins SHARED + plugins/embLayerNormPlugin.cu plugins/geluPlugin.cu - plugins/skipLayerNormPlugin.cu plugins/qkvToContextPlugin.cu - plugins/embLayerNormPlugin.cu - ) + plugins/skipLayerNormPlugin.cu +) -target_link_libraries(bert_plugins +target_link_libraries(bert_plugins + common ${BERT_LIBS} - ) +) -target_link_libraries(common +target_link_libraries(common ${BERT_LIBS} - ) +) add_executable(sample_bert + bert/bert.cpp + bert/driver.cpp + util/dataUtils.cpp sampleBERT.cpp - ) +) target_compile_features(sample_bert PUBLIC cxx_std_11) -target_link_libraries(sample_bert +target_link_libraries(sample_bert common bert_plugins - ) - +) diff --git a/demo/BERT/Dockerfile b/demo/BERT/Dockerfile index eb11f26c..bc7aaa15 100644 --- a/demo/BERT/Dockerfile +++ b/demo/BERT/Dockerfile @@ -12,18 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM nvcr.io/nvidia/tensorrt:19.05-py3 +FROM nvcr.io/nvidia/tensorrt:19.09-py3 ARG myuid ARG mygid -RUN echo $myuid -RUN echo $mygid +RUN echo $myuid +RUN echo $mygid -# TODO: Depending on the docker version, this might work without mapping the user for home dir access RUN groupadd -r -g ${mygid} nb && useradd -r -u ${myuid} -g ${mygid} -ms /bin/bash nb RUN apt-get update && apt-get install -y software-properties-common && add-apt-repository ppa:ubuntu-toolchain-r/test -RUN apt-get update && apt-get install -y pbzip2 pv bzip2 sudo gcc-7 g++-7 zlib1g-dev g++-4.9 +RUN apt-get update && apt-get install -y pbzip2 pv bzip2 sudo gcc-7 g++-7 zlib1g-dev g++-4.8 RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 60 \ --slave /usr/bin/g++ g++ /usr/bin/g++-7 && \ update-alternatives --config gcc @@ -31,6 +30,7 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 60 \ RUN wget https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh && \ sh cmake-3.14.0-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir RUN pip install tensorflow==1.13.1 && pip install horovod +RUN pip install jupyter RUN echo 'nb:abc123' | chpasswd @@ -42,4 +42,3 @@ WORKDIR /workspace RUN git clone https://github.com/NVlabs/cub.git RUN git clone https://github.com/NVIDIA/cutlass.git - diff --git a/demo/BERT/README.md b/demo/BERT/README.md index dc86cf14..e56f44e6 100644 --- a/demo/BERT/README.md +++ b/demo/BERT/README.md @@ -30,6 +30,10 @@ To build the TensorRT OSS components, ensure you meet the following package requ * [PIP](https://pypi.org/project/pip/#history) >= v19.0 +* [TensorFlow](https://www.tensorflow.org/install) + +* [Horovod](https://github.com/horovod/horovod#install) + * Essential libraries and utilities * [Git](https://git-scm.com/downloads), [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/), [Wget](https://www.gnu.org/software/wget/faq.html#download), [Zlib](https://zlib.net/) @@ -43,7 +47,7 @@ To build the TensorRT OSS components, ensure you meet the following package requ **TensorRT Release** -* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-5x-download) v5.1.5 +* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-6x-download) 
v6.0.0 ## Example Workflow @@ -84,7 +88,7 @@ The SQuAD fine-tuned Tensorflow checkpoint can be converted using the following python helpers/convert_weights.py -m $CHECKPOINT -o /filename ``` -This will generate a file `/.weights`. The path that contains the weights file, will be referred to as `WEIGHT_PATH`. +This will generate a file `/bert.weights`. The path that contains the weights file, will be referred to as `WEIGHT_PATH`. ### 3. Generate an input/output pair diff --git a/demo/BERT/bert/bert.cpp b/demo/BERT/bert/bert.cpp new file mode 100644 index 00000000..8a9d0210 --- /dev/null +++ b/demo/BERT/bert/bert.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bert.h" + +#include "bertEncoder.h" +#include "bertUtils.h" +#include "dataUtils.h" +#include "embLayerNormPlugin.h" +#include "squad.h" +#include +#include +#include + +using namespace nvinfer1; + +namespace bert +{ + +BERTDriver::BERTDriver(const int numHeads, const bool useFp16, + const size_t maxWorkspaceSize, const OptProfiles& optProfiles) + : DynamicDriver(useFp16, maxWorkspaceSize, optProfiles) + , mNumHeads(numHeads) +{ +} + +void BERTDriver::buildNetwork(nvinfer1::INetworkDefinition* network, const HostTensorMap& params) +{ + WeightMap weightMap; + for (auto kv : params) + { + const HostTensor& t = *kv.second; + weightMap[kv.first] = Weights{t.mType, t.mData, (int64_t) t.mSize}; + } + int intermediateSize = 0; + int numHiddenLayers = 0; + int hiddenSize = 0; + + gLogVerbose << "Inferring Network size" << endl; + inferNetworkSizes(weightMap, hiddenSize, intermediateSize, numHiddenLayers); + + assert(intermediateSize); + assert(hiddenSize); + assert(numHiddenLayers); + gLogVerbose << intermediateSize << endl; + gLogVerbose << hiddenSize << endl; + gLogVerbose << numHiddenLayers << endl; + + // create the model to populate the network, then set the outputs and create an engine + ITensor* inputIds = network->addInput(kMODEL_INPUT0_NAME, DataType::kINT32, Dims{2, -1, -1}); + ITensor* segmentIds = network->addInput(kMODEL_INPUT1_NAME, DataType::kINT32, Dims{2, -1, -1}); + ITensor* inputMask = network->addInput(kMODEL_INPUT2_NAME, DataType::kINT32, Dims{2, -1, -1}); + + const Weights& wBeta = weightMap.at("bert_embeddings_layernorm_beta"); + const Weights& wGamma = weightMap.at("bert_embeddings_layernorm_gamma"); + const Weights& wWordEmb = weightMap.at("bert_embeddings_word_embeddings"); + const Weights& wTokEmb = weightMap.at("bert_embeddings_token_type_embeddings"); + const Weights& wPosEmb = weightMap.at("bert_embeddings_position_embeddings"); + + gLogVerbose << "embeddings params read" << endl; + + ITensor* inputs[3] = {inputIds, segmentIds, inputMask}; + + auto embPlugin = test::EmbLayerNormPluginDynamic("embeddings", mUseFp16, wBeta, wGamma, wWordEmb, wPosEmb, wTokEmb); + IPluginV2Layer* embLayer = network->addPluginV2(inputs, 3, embPlugin); + embLayer->setName("EmbeddingsLayer"); + setOutputName(embLayer, 
"embeddings_", "output"); + + ITensor* embeddings = embLayer->getOutput(0); + ITensor* maskIdx = embLayer->getOutput(1); + auto dims = embeddings->getDimensions(); + gLogVerbose << "emb out dims: " << dims << endl; + + /// BERT Encoder + + const BertConfig config(mNumHeads, hiddenSize, intermediateSize, numHiddenLayers, mUseFp16); + + ILayer* bertLayer = bertModelDynamic(config, weightMap, network, embeddings, maskIdx); + + /// SQuAD Output Layer + + ILayer* squadLayer = squadDynamic("cls_", config, weightMap, network, bertLayer->getOutput(0)); + + network->markOutput(*squadLayer->getOutput(0)); +} +} diff --git a/demo/BERT/bert/bert.h b/demo/BERT/bert/bert.h new file mode 100644 index 00000000..3b4f87b6 --- /dev/null +++ b/demo/BERT/bert/bert.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_BERT_H +#define TRT_BERT_H + +#include "driver.h" +#include +#include + +constexpr const char* kMODEL_INPUT0_NAME = "input_ids"; +constexpr const char* kMODEL_INPUT1_NAME = "segment_ids"; +constexpr const char* kMODEL_INPUT2_NAME = "input_mask"; + +namespace bert +{ + +struct BERTDriver : DynamicDriver +{ + const int mNumHeads; + + BERTDriver(const int nbHeads, const bool useFp16, const size_t maxWorkspaceSize, const OptProfiles& optProfiles); + + BERTDriver(const std::string& enginePath); + + void buildNetwork(nvinfer1::INetworkDefinition* network, const HostTensorMap& params) override; +}; +} + +#endif // TRT_BERT_H diff --git a/demo/BERT/bert/driver.cpp b/demo/BERT/bert/driver.cpp new file mode 100644 index 00000000..043ee6e5 --- /dev/null +++ b/demo/BERT/bert/driver.cpp @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "driver.h" +#include "cuda_profiler_api.h" +#include + +using namespace nvinfer1; +using namespace samplesCommon; + +namespace bert +{ + +HostTensor::HostTensor(void* data, const DataType type, const vector& shape) + : mShape(shape) + , mData(data) + , mType(type) +{ + mSize = accumulate(shape.begin(), shape.end(), 1, multiplies()); + mNbBytes = mSize * samplesCommon::getElementSize(type); +} + +Driver::Driver(const int maxBatchSize, const bool useFp16, const size_t maxWorkspaceSize) + : mMaxBatchSize(maxBatchSize) + , mMaxWorkspaceSize(maxWorkspaceSize) + , mUseFp16(useFp16) +{ +} + +NetworkDefinitionCreationFlags Driver::getNetworkFlags() const +{ + return 1 << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +} + +IBuilderConfig* Driver::getBuilderConfig() const +{ + IBuilderConfig* config = mBuilder->createBuilderConfig(); + config->setMaxWorkspaceSize(mMaxWorkspaceSize); + if (mUseFp16) + { + config->setFlag(BuilderFlag::kFP16); + } + return config; +} + +void Driver::allocateBindings() +{ + // Static sizes with implicit batch size: allocation sizes known to engine + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + size_t vol = samplesCommon::volume(mEngine->getBindingDimensions(i)); + size_t elementSize = samplesCommon::getElementSize(mEngine->getBindingDataType(i)); + size_t allocationSize = static_cast(mMaxBatchSize) * vol * elementSize; + mDeviceBuffers.emplace_back(DeviceBuffer(allocationSize, mEngine->getBindingDataType(i))); + mBuffers.emplace_back(mDeviceBuffers.back().data()); + } +} + +void Driver::init(const HostTensorMap& params) +{ + mBuilder = createInferBuilder(gLogger.getTRTLogger()); + + const NetworkDefinitionCreationFlags flags = getNetworkFlags(); + INetworkDefinition* network{mBuilder->createNetworkV2(flags)}; + + buildNetwork(network, params); + assert(network); + + IBuilderConfig* config = getBuilderConfig(); + gLogInfo << "Building Engine..." << endl; + // Build the engine + mEngine = (mBuilder->buildEngineWithConfig(*network, *config)); + gLogInfo << "Done building engine." 
<< endl; + + assert(mEngine); + mContext = (mEngine->createExecutionContext()); + assert(mContext); + + allocateBindings(); +} + +void Driver::buildNetwork(INetworkDefinition* network, const HostTensorMap& params) +{ + auto inputTensor = network->addInput("input", DataType::kFLOAT, Dims3{768, 1, 1}); + auto W_ = *params.at("l0_attention_self_query_kernel"); + auto B_ = *params.at("l0_attention_self_query_bias"); + Weights weights{DataType::kFLOAT, W_.mData, static_cast(W_.mSize)}; + Weights bias{DataType::kFLOAT, nullptr, 0}; + + auto fc = network->addFullyConnected(*inputTensor, 768, weights, bias); + fc->getOutput(0)->setName("output"); + network->markOutput(*fc->getOutput(0)); +} + +void Driver::h2d(const HostTensorMap& hostBuffers, cudaStream_t stream) +{ + for (auto& kv : hostBuffers) + { + const int idx = mEngine->getBindingIndex(kv.first.c_str()); + assert(idx >= 0); + assert(mEngine->getBindingDataType(idx) == kv.second->mType); + const size_t len = kv.second->mNbBytes; + CHECK(cudaMemcpyAsync(mBuffers[idx], kv.second->mData, len, cudaMemcpyHostToDevice, stream)); + gLogVerbose << "Binding: " << kv.first << ", idx: " << idx << ", uploading " << len << " bytes" << std::endl; + } +} + +void Driver::d2h(HostTensorMap& hostBuffers, cudaStream_t stream) +{ + for (auto& kv : hostBuffers) + { + const int idx = mEngine->getBindingIndex(kv.first.c_str()); + assert(idx >= 0); + assert(mEngine->getBindingDataType(idx) == kv.second->mType); + const size_t len = kv.second->mNbBytes; + CHECK(cudaMemcpyAsync(kv.second->mData, mBuffers[idx], len, cudaMemcpyDeviceToHost, stream)); + gLogVerbose << "Binding: " << kv.first << ", idx: " << idx << ", downloading " << len << " bytes" << std::endl; + } +} + +void Driver::benchmark(const HostTensorMap& inCfg, HostTensorMap& outCfg, const int batchSize, cudaStream_t stream, + vector& timesTotal, vector& timesCompute, const bool withMemcpy) +{ + const int numRuns = timesTotal.size(); + assert(numRuns == timesCompute.size()); + assert(numRuns > 0); + + void** bs = mBuffers.data(); + + vector startsTotal(numRuns); + vector stopsTotal(numRuns); + vector startsCompute(numRuns); + vector stopsCompute(numRuns); + + for (int it = 0; it < numRuns; it++) + { + cudaEventCreate(&startsTotal[it]); + cudaEventCreate(&stopsTotal[it]); + cudaEventCreate(&startsCompute[it]); + cudaEventCreate(&stopsCompute[it]); + } + + cudaProfilerStart(); + if (withMemcpy) + { + for (int it = 0; it < numRuns; it++) + { + CHECK(cudaEventRecord(startsTotal[it], stream)); + h2d(inCfg, stream); + CHECK(cudaEventRecord(startsCompute[it], stream)); + infer(batchSize, stream); + CHECK(cudaEventRecord(stopsCompute[it], stream)); + d2h(outCfg, stream); + CHECK(cudaEventRecord(stopsTotal[it], stream)); + } + } + else + { + for (int it = 0; it < numRuns; it++) + { + CHECK(cudaEventRecord(startsCompute[it], stream)); + infer(batchSize, stream); + CHECK(cudaEventRecord(stopsCompute[it], stream)); + } + } + CHECK(cudaDeviceSynchronize()); + + cudaProfilerStop(); + float msCompute = 0; + float msTotal = 0; + for (int it = 0; it < numRuns; it++) + { + cudaEventElapsedTime(&msCompute, startsCompute[it], stopsCompute[it]); + timesCompute[it] = msCompute; + + msTotal = msCompute; + if (withMemcpy) + { + cudaEventElapsedTime(&msTotal, startsTotal[it], stopsTotal[it]); + } + timesTotal[it] = msTotal; + + cudaEventDestroy(startsTotal[it]); + cudaEventDestroy(stopsTotal[it]); + cudaEventDestroy(startsCompute[it]); + cudaEventDestroy(stopsCompute[it]); + + gLogInfo << "Run " << it << "; Total: " << 
timesTotal[it] << "ms Comp.only: " << timesCompute[it] << "ms" << std::endl; + } +} + +void Driver::infer(const int batchSize, cudaStream_t stream) +{ + mContext->enqueueV2(mBuffers.data(), stream, nullptr); +} + +void Driver::infer(const HostTensorMap& inCfg, HostTensorMap& outCfg, const int batchSize, cudaStream_t stream) +{ + h2d(inCfg, stream); + infer(batchSize, stream); + d2h(outCfg, stream); +} + +Driver::~Driver() +{ + for (auto b : mBuffers) + { + CHECK(cudaFree(b)); + } +} + +void Driver::serializeEngine(const std::string& enginePath) const +{ + ofstream engineFile(enginePath, ios::binary); + if (!engineFile) + { + gLogError << "Cannot open engine file: " << enginePath << endl; + } + + IHostMemory* serializedEngine{mContext->getEngine().serialize()}; + if (serializedEngine == nullptr) + { + gLogError << "Engine serialization failed" << endl; + } + + engineFile.write(static_cast(serializedEngine->data()), serializedEngine->size()); + serializedEngine->destroy(); +} + +Driver::Driver(const std::string& enginePath) + : mMaxBatchSize(-1) + , mMaxWorkspaceSize(-1) + , mUseFp16(false) +{ + ifstream input(enginePath, ios::binary); + if (!input) + { + gLogError << "Invalid engine file"; + } + vector bytes(istreambuf_iterator(input), {}); + + IRuntime* runtime = createInferRuntime(gLogger); + ICudaEngine* engine = runtime->deserializeCudaEngine(bytes.data(), bytes.size(), nullptr); + assert(engine); + + mContext = (engine->createExecutionContext()); + assert(mContext); + mMaxBatchSize = engine->getMaxBatchSize(); + + engine->destroy(); + runtime->destroy(); +} + +DynamicDriver::DynamicDriver( + const bool useFp16, const size_t maxWorkspaceSize, const OptProfiles& optProfiles) + : Driver(1, useFp16, maxWorkspaceSize) + , mOptProfiles(optProfiles) +{ +} + +DynamicDriver::DynamicDriver(const std::string& enginePath) + : Driver(enginePath) +{ +} + +NetworkDefinitionCreationFlags DynamicDriver::getNetworkFlags() const +{ + return (1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); +} + +IBuilderConfig* DynamicDriver::getBuilderConfig() const +{ + auto config = Driver::getBuilderConfig(); + for (auto& optProfile : mOptProfiles) + { + auto profile = mBuilder->createOptimizationProfile(); + for (auto& kv : optProfile) + { + profile->setDimensions(kv.first.c_str(), OptProfileSelector::kMIN, get(kv.second)); + profile->setDimensions(kv.first.c_str(), OptProfileSelector::kMAX, get(kv.second)); + profile->setDimensions(kv.first.c_str(), OptProfileSelector::kOPT, get(kv.second)); + } + config->addOptimizationProfile(profile); + } + return config; +} + +void DynamicDriver::allocateBindings() +{ + // dynamic shapes: setting each input binding to its maximum binding dimensions + // there should be a opt profile for each input + for (auto kv : mOptProfiles[0])//assuming there is only one opt profile - take its max dims + { + auto iidx = mEngine->getBindingIndex(kv.first.c_str()); + mContext->setBindingDimensions(iidx, get(kv.second)); + } + assert(mContext->allInputDimensionsSpecified()); + + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + auto bDims = mContext->getBindingDimensions(i); + + size_t vol = samplesCommon::volume(bDims); + size_t elementSize = samplesCommon::getElementSize(mEngine->getBindingDataType(i)); + size_t allocationSize = vol * elementSize; + gLogVerbose << "Binding " << mEngine->getBindingName(i) << ": vol=" << vol << " wordSize=" + << elementSize << " allocSize=" << allocationSize << " bytes" << std::endl; + 
mDeviceBuffers.emplace_back(DeviceBuffer(allocationSize, mEngine->getBindingDataType(i))); + mBuffers.emplace_back(mDeviceBuffers.back().data()); + } +} +} diff --git a/demo/BERT/bert/driver.h b/demo/BERT/bert/driver.h new file mode 100644 index 00000000..cc473126 --- /dev/null +++ b/demo/BERT/bert/driver.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_DRIVER_H +#define TRT_DRIVER_H + +#include "buffers.h" +#include +#include + +namespace bert +{ + +struct HostTensor +{ + void* mData{nullptr}; + size_t mNbBytes; + size_t mSize; + nvinfer1::DataType mType; + std::vector mShape; + + HostTensor(void* data, const nvinfer1::DataType type, const std::vector& shape); +}; + +using HostTensorMap = std::map>; + +struct InferDeleter1 +{ + template + void operator()(T* obj) const + { + if (obj) + { + obj->destroy(); + } + } +}; + +template +using SampleUniquePtr = std::unique_ptr; + +struct Driver +{ + std::vector mBuffers; + std::vector mDeviceBuffers; + + nvinfer1::IBuilder* mBuilder{nullptr}; + nvinfer1::ICudaEngine* mEngine{nullptr}; + nvinfer1::IExecutionContext* mContext{nullptr}; + + int mMaxBatchSize; + size_t mMaxWorkspaceSize; + bool mUseFp16; + + Driver(const int maxBatchSize, const bool useFp16, const size_t maxWorkspaceSize); + + Driver(const std::string& enginePath); + + virtual ~Driver(); + + virtual void buildNetwork(INetworkDefinition* network, const HostTensorMap& in); + + virtual nvinfer1::NetworkDefinitionCreationFlags getNetworkFlags() const; + virtual nvinfer1::IBuilderConfig* getBuilderConfig() const; + virtual void allocateBindings(); + + void init(const HostTensorMap& params); + + void h2d(const HostTensorMap& inCfg, cudaStream_t stream); + + void d2h(HostTensorMap& outCfg, cudaStream_t stream); + + void infer(const int batchSize, cudaStream_t stream); + + void infer(const HostTensorMap& inCfg, HostTensorMap& outCfg, const int batchSize, cudaStream_t stream); + + void benchmark(const HostTensorMap& inCfg, HostTensorMap& outCfg, const int batchSize, cudaStream_t stream, + std::vector& timesTotal, std::vector& timesCompute, const bool withMemcpy = true); + + void serializeEngine(const std::string& enginePath) const; +}; + +constexpr uint32_t OPIDX_MIN = 0; +constexpr uint32_t OPIDX_MAX = 1; +constexpr uint32_t OPIDX_OPT = 2; + +using OptProfile = std::tuple; +using OptProfileMap = std::map; +using OptProfiles = std::vector; + +struct DynamicDriver : Driver +{ + + OptProfiles mOptProfiles; + + DynamicDriver(const bool useFp16, const size_t maxWorkspaceSize, const OptProfiles& optProfiles); + + DynamicDriver(const std::string& enginePath); + + nvinfer1::NetworkDefinitionCreationFlags getNetworkFlags() const override; + nvinfer1::IBuilderConfig* getBuilderConfig() const override; + void allocateBindings() override; +}; +} +#endif // TRT_DRIVER_H diff --git a/demo/BERT/docker/build.sh b/demo/BERT/docker/build.sh index fd598d59..e989c3d9 100755 --- 
a/demo/BERT/docker/build.sh +++ b/demo/BERT/docker/build.sh @@ -1,3 +1,18 @@ #!/bin/bash +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# docker build --build-arg myuid=$(id -u) --build-arg mygid=$(id -g) --rm -t sample-bert . diff --git a/demo/BERT/docker/launch.sh b/demo/BERT/docker/launch.sh index 9b954c90..43799bf2 100755 --- a/demo/BERT/docker/launch.sh +++ b/demo/BERT/docker/launch.sh @@ -1,4 +1,19 @@ #!/bin/bash +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# docker run -it --rm \ --name sample-bert \ diff --git a/demo/BERT/helpers/convert_weights.py b/demo/BERT/helpers/convert_weights.py index de750c88..76415fb8 100644 --- a/demo/BERT/helpers/convert_weights.py +++ b/demo/BERT/helpers/convert_weights.py @@ -46,9 +46,10 @@ with open(out_fn, 'wb') as output_file: # there might be training-related variables in the checkpoint that can be discarded - param_names = [key for key in sorted(tensor_dict) if 'adam' not in key and 'global_step' not in key and 'pooler' not in key] + exclude_list = ["adam", "global_step", "pooler", "bad_steps", "good_steps", "loss_scale"] + param_names = [key for key in sorted(tensor_dict) if all([exclude not in key for exclude in exclude_list])] - count = len(param_names) + count = len(param_names) print(count) output_file.write('{}\n'.format(count).encode('ASCII')) @@ -70,4 +71,3 @@ output_file.write(flat_tensor.tobytes()) output_file.write('\n'.encode('ASCII')); print('Orig.name:', pn,'TRT name:', outname, 'shape:' , shape_str) - diff --git a/demo/BERT/layers/attention.h b/demo/BERT/layers/attention.h index 304f8b0a..08a33412 100644 --- a/demo/BERT/layers/attention.h +++ b/demo/BERT/layers/attention.h @@ -18,22 +18,22 @@ #define TRT_ATTENTION_H #include "attentionKeys.h" +#include "bertUtils.h" #include "qkvToContextPlugin.h" namespace bert { -ILayer* attention(const std::string& prefix, const BertConfig& config, WeightMap& weightMap, +inline ILayer* attentionDynamic(const std::string& prefix, const BertConfig& config, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor, ITensor* inputMask = nullptr) { assert(inputTensor); assert(network); const Dims idims = inputTensor->getDimensions(); - assert(idims.nbDims == 4); + gLogVerbose << "Attention input dimensions: " <addFullyConnected(*inputTensor, 3 * hiddenSize, Wall, Ball); + multAllLayer->setName((prefix+"FC_QKV").c_str()); setOutputName(multAllLayer, prefix, 
"qkv_mult"); ITensor* shuffleOut = multAllLayer->getOutput(0); const bool hasMask = inputMask != nullptr; - QKVToContextPlugin qkvPlugin("qkv2ctx", hiddenSize, numHeads, S, hasMask); + test::QKVToContextPluginDynamic qkvPlugin("qkv2ctx", hiddenSize, numHeads, hasMask); ITensor* qkvIn[2] = {shuffleOut, inputMask}; IPluginV2Layer* qkv2ctxLayer = network->addPluginV2(qkvIn, 1 + hasMask, qkvPlugin); + qkv2ctxLayer->setName((prefix + "QKV2CTX").c_str()); setOutputName(qkv2ctxLayer, prefix, "context_layer"); return qkv2ctxLayer; } diff --git a/demo/BERT/layers/bertEncoder.h b/demo/BERT/layers/bertEncoder.h index 2d205113..5dd5136e 100644 --- a/demo/BERT/layers/bertEncoder.h +++ b/demo/BERT/layers/bertEncoder.h @@ -18,11 +18,12 @@ #define TRT_BERT_ENCODER_H #include "transformer.h" +#include namespace bert { -ILayer* bertModel(const BertConfig& config, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor, +inline ILayer* bertModelDynamic(const BertConfig& config, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor, ITensor* input_mask = nullptr) { @@ -34,7 +35,7 @@ ILayer* bertModel(const BertConfig& config, WeightMap& weightMap, INetworkDefini std::stringstream ss; ss << "l" << layer << "_"; - prevLayer = transformer(ss.str(), config, weightMap, network, prevInput, input_mask); + prevLayer = transformerDynamic(ss.str(), config, weightMap, network, prevInput, input_mask); prevInput = prevLayer->getOutput(0); } assert(prevLayer); diff --git a/demo/BERT/layers/squad.h b/demo/BERT/layers/squad.h index c9ef9e80..986f65cf 100644 --- a/demo/BERT/layers/squad.h +++ b/demo/BERT/layers/squad.h @@ -23,18 +23,18 @@ namespace bert const std::string SQD_W = "squad_output_weights"; const std::string SQD_B = "squad_output_bias"; -ILayer* squad(const std::string& prefix, const BertConfig& config, WeightMap& weightMap, INetworkDefinition* network, +inline ILayer* squadDynamic(const std::string& prefix, const BertConfig& config, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor) { assert(inputTensor); assert(network); - const Dims idims = inputTensor->getDimensions(); - assert(idims.nbDims == 4); + //const Dims idims = inputTensor->getDimensions(); + //assert(idims.nbDims == 5); - const int S = idims.d[0]; - const int hiddenSize = idims.d[1]; + //const int S = idims.d[1]; + const int hiddenSize = config.hiddenSize; //idims.d[2]; const Weights W_out = weightMap.at(prefix + SQD_W); const Weights B_out = weightMap.at(prefix + SQD_B); diff --git a/demo/BERT/layers/transformer.h b/demo/BERT/layers/transformer.h index e1eef017..6a9eb3e3 100644 --- a/demo/BERT/layers/transformer.h +++ b/demo/BERT/layers/transformer.h @@ -24,36 +24,36 @@ namespace bert { - -ILayer* skipln( - const std::string& prefix, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor, ITensor* skip) +inline ILayer* skiplnDynamic( + const std::string& prefix,const BertConfig & config, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor, ITensor* skip) { - const Dims idims = inputTensor->getDimensions(); - assert(idims.nbDims == 4); - const int hiddenSize = idims.d[1]; + //const Dims idims = inputTensor->getDimensions(); + //assert(idims.nbDims == 5); + //const int hiddenSize = idims.d[2]; + const int hiddenSize = config.hiddenSize; const Weights& wbeta = weightMap.at(prefix + "beta"); const Weights& wgamma = weightMap.at(prefix + "gamma"); - SkipLayerNormPlugin skipln_plug("skipln", hiddenSize, wbeta, wgamma); + test::SkipLayerNormPluginDynamic 
skipln_plug("skipln", hiddenSize, wbeta, wgamma); ITensor* skiplnInputs[2] = {inputTensor, skip}; IPluginV2Layer* skiplnLayer = network->addPluginV2(skiplnInputs, 2, skipln_plug); return skiplnLayer; } -ILayer* transformer(const std::string& prefix, const BertConfig& config, WeightMap& weightMap, +inline ILayer* transformerDynamic(const std::string& prefix, const BertConfig& config, WeightMap& weightMap, INetworkDefinition* network, ITensor* inputTensor, ITensor* imask = nullptr) { assert(inputTensor); assert(network); - const Dims idims = inputTensor->getDimensions(); - assert(idims.nbDims == 4); - const int hiddenSize = idims.d[1]; + //const Dims idims = inputTensor->getDimensions(); + //assert(idims.nbDims == 5); + const int hiddenSize = config.hiddenSize; - ILayer* attentionHeads = attention(prefix + "attention_self_", config, weightMap, network, inputTensor, imask); + ILayer* attentionHeads = attentionDynamic(prefix + "attention_self_", config, weightMap, network, inputTensor, imask); const Weights wA = weightMap.at(prefix + W_AOUT); const Weights bA = weightMap.at(prefix + B_AOUT); @@ -61,7 +61,7 @@ ILayer* transformer(const std::string& prefix, const BertConfig& config, WeightM IFullyConnectedLayer* attOutFCLayer = network->addFullyConnected(*attentionHeads->getOutput(0), hiddenSize, wA, bA); ILayer* attLNLayer - = skipln(prefix + "attention_output_layernorm_", weightMap, network, attOutFCLayer->getOutput(0), inputTensor); + = skiplnDynamic(prefix + "attention_output_layernorm_",config, weightMap, network, attOutFCLayer->getOutput(0), inputTensor); const Weights wMid = weightMap.at(prefix + W_MID); const Weights bMid = weightMap.at(prefix + B_MID); @@ -70,7 +70,7 @@ ILayer* transformer(const std::string& prefix, const BertConfig& config, WeightM = network->addFullyConnected(*attLNLayer->getOutput(0), config.intermediateSize, wMid, bMid); // gelu - auto geluPlugin = GeluPlugin("gelu"); + auto geluPlugin = test::GeluPluginDynamic("gelu"); ITensor* midDenseOut = midDenseLayer->getOutput(0); IPluginV2Layer* geluLayer = network->addPluginV2(&midDenseOut, 1, geluPlugin); ITensor* midAct = geluLayer->getOutput(0); @@ -84,8 +84,8 @@ ILayer* transformer(const std::string& prefix, const BertConfig& config, WeightM IFullyConnectedLayer* outDenseLayer = network->addFullyConnected(*midAct, hiddenSize, wL, bL); setOutputName(outDenseLayer, prefix + "output_", "dense"); - ILayer* outLNLayer = skipln( - prefix + "output_layernorm_", weightMap, network, outDenseLayer->getOutput(0), attLNLayer->getOutput(0)); + ILayer* outLNLayer = skiplnDynamic( + prefix + "output_layernorm_", config, weightMap, network, outDenseLayer->getOutput(0), attLNLayer->getOutput(0)); assert(outLNLayer); diff --git a/demo/BERT/plugins/embLayerNormPlugin.cu b/demo/BERT/plugins/embLayerNormPlugin.cu index 69857ec4..75e48e15 100644 --- a/demo/BERT/plugins/embLayerNormPlugin.cu +++ b/demo/BERT/plugins/embLayerNormPlugin.cu @@ -14,21 +14,26 @@ * limitations under the License. 
*/ +#include +#include +#include + #include "NvInfer.h" +#include "common.h" #include "embLayerNormPlugin.h" #include "logger.h" #include "pluginKernels.h" #include "pluginUtil.h" -#include -#include -#include - +using namespace nvinfer1; using bert::operator+; namespace bert { +namespace test +{ + template __global__ void embLayerNormKernel(int ld, const int* inputIds, const int* tokenIds, const float* beta, const float* gamma, const float* wordEmb, const float* posEmb, const float* tokEmb, T* output) @@ -79,7 +84,7 @@ __global__ void embLayerNormKernel(int ld, const int* inputIds, const int* token } template -int embSkipLayerNorm(cudaStream_t stream, int ld, int B, int S, const int* inputIds, const int* token_ids, +inline int embSkipLayerNorm(cudaStream_t stream, int ld, int B, int S, const int* inputIds, const int* token_ids, const float* beta, const float* gamma, const float* wordEmb, const float* posEmb, const float* tokEmb, T* output) { @@ -94,23 +99,21 @@ int embSkipLayerNorm(cudaStream_t stream, int ld, int B, int S, const int* input return 0; } -using namespace nvinfer1; - // Clip plugin specific constants namespace { static const char* EMB_LAYER_NORM_VERSION{"1"}; -static const char* EMB_LAYER_NORM_NAME{"CustomEmbLayerNormPlugin"}; +static const char* EMB_LAYER_NORM_NAME{"CustomEmbLayerNormPluginDynamic"}; } // namespace // Static class fields initialization -PluginFieldCollection EmbLayerNormPluginCreator::mFC{}; -std::vector EmbLayerNormPluginCreator::mPluginAttributes; +PluginFieldCollection EmbLayerNormPluginDynamicCreator::mFC{}; +std::vector EmbLayerNormPluginDynamicCreator::mPluginAttributes; -REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginCreator); +REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginDynamicCreator); -EmbLayerNormPlugin::EmbLayerNormPlugin(const std::string& name, const bool outputFp16, const Weights& beta, - const Weights& gamma, const Weights& wordEmb, const Weights& posEmb, const Weights& tokEmb) +EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(const std::string& name, const bool outputFp16, + const Weights& beta, const Weights& gamma, const Weights& wordEmb, const Weights& posEmb, const Weights& tokEmb) : mLayerName(name) , mLd(beta.count) , mGamma(gamma) @@ -136,7 +139,7 @@ EmbLayerNormPlugin::EmbLayerNormPlugin(const std::string& name, const bool outpu mType = outputFp16 ? 
DataType::kHALF : DataType::kFLOAT; } -EmbLayerNormPlugin::EmbLayerNormPlugin(const std::string& name, const void* data, size_t length) +EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(const std::string& name, const void* data, size_t length) : mLayerName(name) { gLogVerbose << "EMB LN Deser start\n"; @@ -171,23 +174,160 @@ EmbLayerNormPlugin::EmbLayerNormPlugin(const std::string& name, const void* data gLogVerbose << "EMB LN Deser done\n"; } -const char* EmbLayerNormPlugin::getPluginType() const +// IPluginV2DynamicExt Methods +IPluginV2DynamicExt* EmbLayerNormPluginDynamic::clone() const { - return EMB_LAYER_NORM_NAME; + gLogVerbose << "EMBLN clone start" << std::endl; + auto ret = new EmbLayerNormPluginDynamic( + mLayerName, mType == DataType::kHALF, mBeta, mGamma, mWordEmb, mPosEmb, mTokEmb); + ret->mS = mS; + + ret->mWordEmbDev = mWordEmbDev; + ret->mPosEmbDev = mPosEmbDev; + ret->mTokEmbDev = mTokEmbDev; + ret->mBetaDev = mBetaDev; + ret->mGammaDev = mGammaDev; + gLogVerbose << "EMBLN clone done" << std::endl; + return ret; } -const char* EmbLayerNormPlugin::getPluginVersion() const +DimsExprs EmbLayerNormPluginDynamic::getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) { - return EMB_LAYER_NORM_VERSION; + // Input should be input ids and token ids and the input mask + // Output should be the embeddings tensor and mask indices + assert(nbInputs == 3); + + assert(inputs[0].nbDims == 2); // BxS + assert(inputs[0].nbDims == inputs[1].nbDims); + assert(inputs[0].nbDims == inputs[2].nbDims); + + assert(outputIndex == 0 || outputIndex == 1); + + if (outputIndex == 0) + { + DimsExprs ret; + ret.nbDims = 5; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + ret.d[2] = exprBuilder.constant(mLd); + ret.d[3] = exprBuilder.constant(1); + ret.d[4] = exprBuilder.constant(1); + return ret; + } + + DimsExprs ret; + ret.nbDims = 1; + ret.d[0] = inputs[0].d[BDIM]; + return ret; } -int EmbLayerNormPlugin::getNbOutputs() const +bool EmbLayerNormPluginDynamic::supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) { - return 2; + // 3 inputs of size BxS + assert(nbInputs == 3); + assert(nbOutputs == 2); + + const PluginTensorDesc& desc = inOut[pos]; + if (pos == 0) + { + return desc.type == DataType::kINT32 && desc.format == TensorFormat::kLINEAR && desc.dims.nbDims == 2; + } + + const PluginTensorDesc& prev = inOut[pos - 1]; + if (pos == 1 || pos == 2) + { + return desc.type == DataType::kINT32 && desc.format == TensorFormat::kLINEAR && desc.dims.nbDims == 2 + && desc.dims.d[BDIM] == prev.dims.d[BDIM] && desc.dims.d[SDIM] == prev.dims.d[SDIM]; + } + + if (pos == 3) + { // embedded sequence + return desc.type == mType && desc.format == TensorFormat::kLINEAR && desc.dims.nbDims == 5 + && desc.dims.d[BDIM] == prev.dims.d[BDIM] && desc.dims.d[SDIM] == prev.dims.d[SDIM] + && desc.dims.d[3] == 1 && desc.dims.d[4] == 1; + } + // pos == 4: mask + return desc.type == DataType::kINT32 && desc.format == TensorFormat::kLINEAR + && desc.dims.nbDims == 1 && desc.dims.d[BDIM] == prev.dims.d[BDIM]; +} + +void EmbLayerNormPluginDynamic::configurePlugin(const DynamicPluginTensorDesc* inputs, int nbInputs, + const DynamicPluginTensorDesc* outputs, int nbOutputs) +{ + // Validate input arguments + assert(nbOutputs == 2); + assert(nbInputs == 3); + + assert(inputs[0].desc.dims.nbDims == 2); + mS = inputs[0].desc.dims.d[SDIM]; + const int B = inputs[0].desc.dims.d[BDIM]; + assert(mS == inputs[1].desc.dims.d[SDIM]); + 
assert(B == inputs[1].desc.dims.d[BDIM]); + assert(mS == inputs[2].desc.dims.d[SDIM]); + assert(B == inputs[2].desc.dims.d[BDIM]); + + assert(outputs[0].desc.dims.nbDims == 5); + assert(outputs[0].desc.dims.d[SDIM] == mS); + assert(outputs[0].desc.dims.d[BDIM] == B); + assert(outputs[0].desc.dims.d[2] == mLd); + assert(outputs[0].desc.dims.d[3] == 1); + assert(outputs[0].desc.dims.d[4] == 1); + + assert(outputs[1].desc.dims.nbDims == 1); + assert(outputs[1].desc.dims.d[0] == B); + + assert(inputs[0].desc.type== DataType::kINT32); + assert(inputs[1].desc.type== DataType::kINT32); + assert(inputs[2].desc.type== DataType::kINT32); + const DataType out_type = outputs[0].desc.type; + assert(out_type == DataType::kFLOAT || out_type == DataType::kHALF); + assert(outputs[1].desc.type == DataType::kINT32); +} + +size_t EmbLayerNormPluginDynamic::getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, + const PluginTensorDesc* outputs, int nbOutputs) const +{ + return 0; +} + +int EmbLayerNormPluginDynamic::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) +{ + const int batchSize = inputDesc->dims.d[BDIM]; + const int S = inputDesc->dims.d[SDIM]; + int status = -1; + + // Our plugin outputs only one tensor + const int* inputIds = static_cast(inputs[0]); + const int* segmentIds = static_cast(inputs[1]); + const int* inputMask = static_cast(inputs[2]); + + if (mType == DataType::kFLOAT) + { + float* output = static_cast(outputs[0]); + embSkipLayerNorm(stream, mLd, batchSize, S, inputIds, segmentIds, mBetaDev, mGammaDev, mWordEmbDev, + mPosEmbDev, mTokEmbDev, output); + } + else if (mType == DataType::kHALF) + { + half* output = static_cast(outputs[0]); + embSkipLayerNorm(stream, mLd, batchSize, S, inputIds, segmentIds, mBetaDev, mGammaDev, mWordEmbDev, + mPosEmbDev, mTokEmbDev, output); + } + else + { + assert(false); + } + int* maskIdx = static_cast(outputs[1]); + computeMaskIdx(stream, S, batchSize, inputMask, maskIdx); + + return status; } -DataType EmbLayerNormPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +// IPluginV2Ext Methods +DataType EmbLayerNormPluginDynamic::getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const { + assert(index == 0 || index == 1); if (index == 0) { @@ -197,28 +337,23 @@ DataType EmbLayerNormPlugin::getOutputDataType(int index, const nvinfer1::DataTy return DataType::kINT32; } -Dims EmbLayerNormPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) +// IPluginV2 Methods +const char* EmbLayerNormPluginDynamic::getPluginType() const { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - assert(nbInputDims == 3); - assert(inputs[0].nbDims == 1); // S - assert(inputs[0].nbDims == inputs[1].nbDims); - const int S = inputs[0].d[0]; - assert(inputs[1].d[0] == S); - assert(inputs[2].d[0] == S); + return EMB_LAYER_NORM_NAME; +} - assert(index == 0 || index == 1); +const char* EmbLayerNormPluginDynamic::getPluginVersion() const +{ + return EMB_LAYER_NORM_VERSION; +} - if (index == 0) - { - const int hidden_size = mLd; - return Dims4{S, hidden_size, 1, 1}; - } - return Dims{1, 1}; +int EmbLayerNormPluginDynamic::getNbOutputs() const +{ + return 2; } -int EmbLayerNormPlugin::initialize() +int EmbLayerNormPluginDynamic::initialize() { if (mGamma.values) { @@ -250,38 +385,18 @@ int 
EmbLayerNormPlugin::initialize() return 0; } -int EmbLayerNormPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) +void EmbLayerNormPluginDynamic::terminate() { - int status = -1; - - // Our plugin outputs only one tensor - const int* inputIds = static_cast(inputs[0]); - const int* segmentIds = static_cast(inputs[1]); - const int* inputMask = static_cast(inputs[2]); - - if (mType == DataType::kFLOAT) - { - float* output = static_cast(outputs[0]); - embSkipLayerNorm(stream, mLd, batchSize, mS, inputIds, segmentIds, mBetaDev, mGammaDev, mWordEmbDev, - mPosEmbDev, mTokEmbDev, output); - } - else if (mType == DataType::kHALF) - { - half* output = static_cast(outputs[0]); - embSkipLayerNorm(stream, mLd, batchSize, mS, inputIds, segmentIds, mBetaDev, mGammaDev, mWordEmbDev, - mPosEmbDev, mTokEmbDev, output); - } - else - { - assert(false); - } - int* maskIdx = static_cast(outputs[1]); - computeMaskIdx(stream, mS, batchSize, inputMask, maskIdx); - - return status; + gLogVerbose << "EMBLN terminate start" << std::endl; + CHECK(cudaFree(mGammaDev)); + CHECK(cudaFree(mBetaDev)); + CHECK(cudaFree(mWordEmbDev)); + CHECK(cudaFree(mTokEmbDev)); + CHECK(cudaFree(mPosEmbDev)); + gLogVerbose << "EMBLN terminate done" << std::endl; } -size_t EmbLayerNormPlugin::getSerializationSize() const +size_t EmbLayerNormPluginDynamic::getSerializationSize() const { return 2 * sizeof(float) * mLd // beta + gamma + sizeof(mType) + sizeof(mLd) * 5 //mLd, mS, m*VocabSize @@ -291,7 +406,7 @@ size_t EmbLayerNormPlugin::getSerializationSize() const ; } -void EmbLayerNormPlugin::serialize(void* buffer) const +void EmbLayerNormPluginDynamic::serialize(void* buffer) const { char* d = static_cast(buffer); const char* a = d; @@ -310,62 +425,7 @@ void EmbLayerNormPlugin::serialize(void* buffer) const assert(d == a + getSerializationSize()); } -void EmbLayerNormPlugin::configurePlugin(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, - const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, - const bool* outputIsBroadcast, PluginFormat format, int maxBatchSize) -{ - - // Validate input arguments - assert(nbOutputs == 2); - assert(nbInputs == 3); - - assert(inputs[0].nbDims == 1); - mS = inputs[0].d[0]; - assert(mS == inputs[1].d[0]); - assert(mS == inputs[2].d[0]); - - assert(outputs[0].nbDims == 4); - assert(outputs[0].d[0] == mS); - assert(outputs[0].d[1] == mLd); - assert(outputs[0].d[2] == 1); - assert(outputs[0].d[3] == 1); - - assert(outputs[1].nbDims == 1); - assert(outputs[1].d[0] == 1); - - assert(format == PluginFormat::kNCHW); - assert(inputTypes[0] == DataType::kINT32); - assert(inputTypes[1] == DataType::kINT32); - assert(inputTypes[2] == DataType::kINT32); - const DataType out_type = outputTypes[0]; - assert(out_type == DataType::kFLOAT || out_type == DataType::kHALF); - assert(outputTypes[1] == DataType::kINT32); -} - -bool EmbLayerNormPlugin::supportsFormat(DataType type, PluginFormat format) const -{ - if (type == DataType::kINT32 || type == DataType::kFLOAT || type == DataType::kHALF) - { - return format == PluginFormat::kNCHW; - } - else - { - return false; - } -} - -void EmbLayerNormPlugin::terminate() -{ - gLogVerbose << "EMBLN terminate start" << std::endl; - cudaFree(mGammaDev); - cudaFree(mBetaDev); - cudaFree(mWordEmbDev); - cudaFree(mTokEmbDev); - cudaFree(mPosEmbDev); - gLogVerbose << "EMBLN terminate done" << std::endl; -} - -void EmbLayerNormPlugin::destroy() +void EmbLayerNormPluginDynamic::destroy() { 
gLogVerbose << "EMBLN destroy start" << std::endl; // This gets called when the network containing plugin is destroyed @@ -373,55 +433,42 @@ void EmbLayerNormPlugin::destroy() gLogVerbose << "EMBLN destroy start" << std::endl; } -IPluginV2Ext* EmbLayerNormPlugin::clone() const -{ - gLogVerbose << "EMBLN clone start" << std::endl; - auto ret = new EmbLayerNormPlugin(mLayerName, mType == DataType::kHALF, mBeta, mGamma, mWordEmb, mPosEmb, mTokEmb); - ret->mS = mS; - - ret->mWordEmbDev = mWordEmbDev; - ret->mPosEmbDev = mPosEmbDev; - ret->mTokEmbDev = mTokEmbDev; - ret->mBetaDev = mBetaDev; - ret->mGammaDev = mGammaDev; - gLogVerbose << "EMBLN clone done" << std::endl; - return ret; -} - -void EmbLayerNormPlugin::setPluginNamespace(const char* libNamespace) +void EmbLayerNormPluginDynamic::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* EmbLayerNormPlugin::getPluginNamespace() const +const char* EmbLayerNormPluginDynamic::getPluginNamespace() const { return mNamespace.c_str(); } -EmbLayerNormPluginCreator::EmbLayerNormPluginCreator() +/////////////////////// + +EmbLayerNormPluginDynamicCreator::EmbLayerNormPluginDynamicCreator() { mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } -const char* EmbLayerNormPluginCreator::getPluginName() const +const char* EmbLayerNormPluginDynamicCreator::getPluginName() const { return EMB_LAYER_NORM_NAME; } -const char* EmbLayerNormPluginCreator::getPluginVersion() const +const char* EmbLayerNormPluginDynamicCreator::getPluginVersion() const { return EMB_LAYER_NORM_VERSION; } -const PluginFieldCollection* EmbLayerNormPluginCreator::getFieldNames() +const PluginFieldCollection* EmbLayerNormPluginDynamicCreator::getFieldNames() { return &mFC; } -IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +IPluginV2* EmbLayerNormPluginDynamicCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { - gLogVerbose << "Creating EmbLayerNormPlugin...\n"; + gLogVerbose << "Creating EmbLayerNormPluginDynamic...\n"; bool output_fp16 = true; Weights beta; @@ -429,10 +476,10 @@ IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const Plugi Weights word_emb; Weights pos_emb; Weights tok_emb; - for(int i=0; i< fc->nbFields; i++) + for (int i = 0; i < fc->nbFields; i++) { std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta")==0) + if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { gLogVerbose << "Building bert_embeddings_layernorm_beta...\n"; beta.values = fc->fields[i].data; @@ -440,7 +487,7 @@ IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const Plugi beta.type = static_cast(fc->fields[i].type); } - if (field_name.compare("bert_embeddings_layernorm_gamma")==0) + if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { gLogVerbose << "Building bert_embeddings_layernorm_gamma...\n"; gamma.values = fc->fields[i].data; @@ -448,7 +495,7 @@ IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const Plugi gamma.type = static_cast(fc->fields[i].type); } - if (field_name.compare("bert_embeddings_word_embeddings")==0) + if (field_name.compare("bert_embeddings_word_embeddings") == 0) { gLogVerbose << "Building bert_embeddings_word_embeddings...\n"; word_emb.values = fc->fields[i].data; @@ -456,7 +503,7 @@ IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const Plugi word_emb.type = 
static_cast(fc->fields[i].type); } - if (field_name.compare("bert_embeddings_token_type_embeddings")==0) + if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) { gLogVerbose << "Building bert_embeddings_token_type_embeddings...\n"; tok_emb.values = fc->fields[i].data; @@ -464,7 +511,7 @@ IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const Plugi tok_emb.type = static_cast(fc->fields[i].type); } - if (field_name.compare("bert_embeddings_position_embeddings")==0) + if (field_name.compare("bert_embeddings_position_embeddings") == 0) { gLogVerbose << "Building bert_embeddings_position_embeddings...\n"; pos_emb.values = fc->fields[i].data; @@ -474,24 +521,27 @@ IPluginV2* EmbLayerNormPluginCreator::createPlugin(const char* name, const Plugi } gLogVerbose << "Building the Plugin...\n"; - EmbLayerNormPlugin* p = new EmbLayerNormPlugin(name, output_fp16, beta, gamma, word_emb, pos_emb, tok_emb); + EmbLayerNormPluginDynamic* p + = new EmbLayerNormPluginDynamic(name, output_fp16, beta, gamma, word_emb, pos_emb, tok_emb); return p; } -IPluginV2* EmbLayerNormPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) +IPluginV2* EmbLayerNormPluginDynamicCreator::deserializePlugin( + const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will - // call EmbLayerNormPlugin::destroy() - return new EmbLayerNormPlugin(name, serialData, serialLength); + // call EmbLayerNormPluginDynamic::destroy() + return new EmbLayerNormPluginDynamic(name, serialData, serialLength); } -void EmbLayerNormPluginCreator::setPluginNamespace(const char* libNamespace) +void EmbLayerNormPluginDynamicCreator::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* EmbLayerNormPluginCreator::getPluginNamespace() const +const char* EmbLayerNormPluginDynamicCreator::getPluginNamespace() const { return mNamespace.c_str(); } } +} diff --git a/demo/BERT/plugins/embLayerNormPlugin.h b/demo/BERT/plugins/embLayerNormPlugin.h index b62c24a1..a8a66cde 100644 --- a/demo/BERT/plugins/embLayerNormPlugin.h +++ b/demo/BERT/plugins/embLayerNormPlugin.h @@ -18,79 +18,59 @@ #define TRT_EMB_LAYER_NORM_PLUGIN_H #include "NvInferPlugin.h" +#include "NvInferRuntime.h" #include #include namespace bert { -using namespace nvinfer1; +namespace test +{ // One of the preferred ways of making TensorRT to be able to see // our custom layer requires extending IPluginV2 and IPluginCreator classes. // For requirements for overriden functions, check TensorRT API docs. -class EmbLayerNormPlugin : public IPluginV2Ext +class EmbLayerNormPluginDynamic : public nvinfer1::IPluginV2DynamicExt { public: - EmbLayerNormPlugin(const std::string& name, const bool use_fp16, const Weights& beta, const Weights& gamma, + EmbLayerNormPluginDynamic(const std::string& name, const bool use_fp16, const Weights& beta, const Weights& gamma, const Weights& word_emb, const Weights& pos_emb, const Weights& tok_emb); - EmbLayerNormPlugin(const std::string& name, const void* data, size_t length); + EmbLayerNormPluginDynamic(const std::string& name, const void* data, size_t length); - // It doesn't make sense to make EmbLayerNormPlugin without arguments, so we + // It doesn't make sense to make EmbLayerNormPluginDynamic without arguments, so we // delete default constructor. 
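Note: the createPlugin() parsing above looks up the five BERT embedding/LayerNorm weights by name. A hedged usage sketch of the builder side, showing how a caller might hand those weights to the creator (the buffer variables and the "embeddings" layer name are placeholders, not values from this patch; the field name strings match the ones checked above):

    #include <vector>
    #include "NvInferPlugin.h"
    #include "embLayerNormPlugin.h"

    using namespace nvinfer1;

    // Hypothetical host buffers holding the fp32 weights.
    extern std::vector<float> betaW, gammaW, wordEmbW, posEmbW, tokEmbW;

    IPluginV2* buildEmbLayerNormPlugin(bert::test::EmbLayerNormPluginDynamicCreator& creator)
    {
        std::vector<PluginField> fields{
            {"bert_embeddings_layernorm_beta", betaW.data(), PluginFieldType::kFLOAT32, (int) betaW.size()},
            {"bert_embeddings_layernorm_gamma", gammaW.data(), PluginFieldType::kFLOAT32, (int) gammaW.size()},
            {"bert_embeddings_word_embeddings", wordEmbW.data(), PluginFieldType::kFLOAT32, (int) wordEmbW.size()},
            {"bert_embeddings_token_type_embeddings", tokEmbW.data(), PluginFieldType::kFLOAT32, (int) tokEmbW.size()},
            {"bert_embeddings_position_embeddings", posEmbW.data(), PluginFieldType::kFLOAT32, (int) posEmbW.size()}};
        PluginFieldCollection fc{(int) fields.size(), fields.data()};
        return creator.createPlugin("embeddings", &fc);
    }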
- EmbLayerNormPlugin() = delete; - + EmbLayerNormPluginDynamic() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + // IPluginV2 Methods + const char* getPluginType() const override; + const char* getPluginVersion() const override; int getNbOutputs() const override; - - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; - int initialize() override; - void terminate() override; - - size_t getWorkspaceSize(int) const override - { - return 0; - }; - - int enqueue( - int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - - bool supportsFormat(DataType type, PluginFormat format) const override; - - const char* getPluginType() const override; - - const char* getPluginVersion() const override; - void destroy() override; - - nvinfer1::IPluginV2Ext* clone() const override; - void setPluginNamespace(const char* pluginNamespace) override; - const char* getPluginNamespace() const override; - DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; - - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const - { - return false; - } - - bool canBroadcastInputAcrossBatch(int inputIndex) const - { - return false; - } - - void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, - const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, - const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; - private: const std::string mLayerName; std::string mNamespace; @@ -106,37 +86,38 @@ class EmbLayerNormPlugin : public IPluginV2Ext size_t mWordVocabSize; size_t mPosVocabSize; size_t mTokVocabSize; - Weights mBeta; - Weights mGamma; - Weights mWordEmb; - Weights mTokEmb; - Weights mPosEmb; - DataType mType; + nvinfer1::Weights mBeta; + nvinfer1::Weights mGamma; + nvinfer1::Weights mWordEmb; + nvinfer1::Weights mTokEmb; + nvinfer1::Weights mPosEmb; + nvinfer1::DataType mType; }; -class EmbLayerNormPluginCreator : public IPluginCreator +class EmbLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - EmbLayerNormPluginCreator(); + EmbLayerNormPluginDynamicCreator(); const char* getPluginName() const override; const char* getPluginVersion() const override; - const PluginFieldCollection* getFieldNames() override; + const 
nvinfer1::PluginFieldCollection* getFieldNames() override; - IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) override; - IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; std::string mNamespace; }; } +} #endif // TRT_EMB_LAYER_NORM_PLUGIN_H diff --git a/demo/BERT/plugins/geluPlugin.cu b/demo/BERT/plugins/geluPlugin.cu index 68f4b73d..5ce7e2c4 100644 --- a/demo/BERT/plugins/geluPlugin.cu +++ b/demo/BERT/plugins/geluPlugin.cu @@ -14,24 +14,27 @@ * limitations under the License. */ +#include +#include +#include + #include "NvInfer.h" #include "geluPlugin.h" #include "pluginKernels.h" - +#include "common.h" #include "logger.h" -#include -#include -#include + +using namespace nvinfer1; namespace bert { -////// CUDA KERNELS /////////////////////// + +namespace test +{ // constants for approximating the normal cdf constexpr float A = 0.5; - constexpr float B = 0.7978845608028654; // sqrt(2.0/M_PI) - constexpr float C = 0.035677408136300125; // 0.044715 * sqrt(2.0/M_PI) template @@ -48,7 +51,7 @@ __global__ void geluKernel(const T a, const T b, const T c, int n, const T* inpu } } -int computeGelu(cudaStream_t stream, int n, const float* input, float* output) +inline int computeGelu(cudaStream_t stream, int n, const float* input, float* output) { constexpr int blockSize = 256; @@ -59,7 +62,7 @@ int computeGelu(cudaStream_t stream, int n, const float* input, float* output) return 0; } -int computeGelu(cudaStream_t stream, int n, const half* input, half* output) +inline int computeGelu(cudaStream_t stream, int n, const half* input, half* output) { const int blockSize = 256; @@ -85,73 +88,79 @@ int computeGelu(cudaStream_t stream, int n, const half* input, half* output) return 0; } -//////////////////////////////////////////// - -using namespace nvinfer1; - namespace { static const char* GELU_PLUGIN_VERSION{"1"}; -static const char* GELU_PLUGIN_NAME{"CustomGeluPlugin"}; +static const char* GELU_PLUGIN_NAME{"CustomGeluPluginDynamic"}; } // namespace // Static class fields initialization -PluginFieldCollection GeluPluginCreator::mFC{}; -std::vector GeluPluginCreator::mPluginAttributes; +PluginFieldCollection GeluPluginDynamicCreator::mFC{}; +std::vector GeluPluginDynamicCreator::mPluginAttributes; -REGISTER_TENSORRT_PLUGIN(GeluPluginCreator); +REGISTER_TENSORRT_PLUGIN(GeluPluginDynamicCreator); -GeluPlugin::GeluPlugin(const std::string name) +GeluPluginDynamic::GeluPluginDynamic(const std::string name) : mLayerName(name) { } -GeluPlugin::GeluPlugin(const std::string name, const void* data, size_t length) +GeluPluginDynamic::GeluPluginDynamic(const std::string name, const void* data, size_t length) : mLayerName(name) { gLogVerbose << "Gelu Deser start" << std::endl; const char* d = static_cast(data); const char* a = d; - mInputVolume = readFromBuffer(d); mType = readFromBuffer(d); assert(d == a + length); gLogVerbose << "Gelu Deser done" << std::endl; } - -const char* GeluPlugin::getPluginType() const +// IPluginV2DynamicExt Methods 
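Note: the computeGelu kernels above evaluate the tanh approximation of GELU, gelu(x) ≈ 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³))), using the constants A, B and C defined earlier in the file. A plain C++ reference of that formula (a host-side sketch for checking values, not the CUDA path itself):

    #include <cmath>

    // Host reference for the tanh-based GELU approximation used by geluKernel.
    // A = 0.5, B = sqrt(2/pi), C = 0.044715 * sqrt(2/pi), matching the constants above.
    inline float geluRef(float x)
    {
        const float a = 0.5f;
        const float b = 0.7978845608028654f;
        const float c = 0.035677408136300125f;
        const float cdf = a + a * std::tanh(x * (b + c * x * x));
        return x * cdf;
    }

For example, geluRef(0.0f) is 0 and geluRef(1.0f) ≈ 0.841, close to the exact GELU value Φ(1)·1 ≈ 0.8413.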
+nvinfer1::IPluginV2DynamicExt* GeluPluginDynamic::clone() const { - return GELU_PLUGIN_NAME; + return new GeluPluginDynamic(mLayerName); } -const char* GeluPlugin::getPluginVersion() const +nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) { - return GELU_PLUGIN_VERSION; + return inputs[0]; } -int GeluPlugin::getNbOutputs() const +bool GeluPluginDynamic::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) { - return 1; + + const PluginTensorDesc& input = inOut[0]; + if (pos == 0) + { + return (input.type == DataType::kFLOAT || input.type == DataType::kHALF) + && (input.format == TensorFormat::kLINEAR); + } + if (pos == 1) + { + const PluginTensorDesc& output = inOut[1]; + return (input.type == output.type) && (output.format == TensorFormat::kLINEAR); + } + return false; } -Dims GeluPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) +void GeluPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) { - // Validate input arguments - assert(nbInputDims == 1); - assert(index == 0); - - // doesn't change input dimension, so output Dims will be the same as - // input Dims - return *inputs; + mType = in[0].desc.type; } -int GeluPlugin::initialize() +size_t GeluPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { return 0; } - -int GeluPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) +int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) { + + const int inputVolume = samplesCommon::volume(inputDesc[0].dims); int status = -1; // Our plugin outputs only one tensor @@ -160,13 +169,13 @@ int GeluPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs { const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); - status = computeGelu(stream, mInputVolume * batchSize, input, output); + status = computeGelu(stream, inputVolume, input, output); } else if (mType == DataType::kHALF) { const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - status = computeGelu(stream, mInputVolume * batchSize, input, output); + status = computeGelu(stream, inputVolume, input, output); } else { @@ -176,69 +185,69 @@ int GeluPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs return status; } -size_t GeluPlugin::getSerializationSize() const +// IPluginV2Ext Methods +nvinfer1::DataType GeluPluginDynamic::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { - return sizeof(mInputVolume) + sizeof(DataType); + assert(index == 0); + assert(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); + return inputTypes[0]; } -void GeluPlugin::serialize(void* buffer) const +// IPluginV2 Methods + +const char* GeluPluginDynamic::getPluginType() const { - char *d = static_cast(buffer), *a = d; - writeToBuffer(d, mInputVolume); - writeToBuffer(d, mType); - assert(d == a + getSerializationSize()); + return GELU_PLUGIN_NAME; } -void GeluPlugin::configureWithFormat( - const Dims* inputs, int nbInputs, const 
Dims* outputs, int nbOutputs, DataType type, PluginFormat format, int) +const char* GeluPluginDynamic::getPluginVersion() const { + return GELU_PLUGIN_VERSION; +} - // Validate input arguments - assert(nbOutputs == 1); - assert(format == PluginFormat::kNCHW); - - // Fetch volume for future enqueue() operations - size_t volume = 1; - for (int i = 0; i < inputs->nbDims; i++) - { - volume *= inputs->d[i]; - } - mInputVolume = volume; - mType = type; +int GeluPluginDynamic::getNbOutputs() const +{ + return 1; } -bool GeluPlugin::supportsFormat(DataType type, PluginFormat format) const +int GeluPluginDynamic::initialize() { - if (type == DataType::kFLOAT || type == DataType::kHALF) - return format == PluginFormat::kNCHW; - else - return false; + return 0; } -void GeluPlugin::terminate() {} +void GeluPluginDynamic::terminate() {} -void GeluPlugin::destroy() +size_t GeluPluginDynamic::getSerializationSize() const { - // This gets called when the network containing plugin is destroyed - delete this; + return sizeof(DataType); +} + +void GeluPluginDynamic::serialize(void* buffer) const +{ + char *d = static_cast(buffer), *a = d; + writeToBuffer(d, mType); + assert(d == a + getSerializationSize()); } -IPluginV2* GeluPlugin::clone() const +void GeluPluginDynamic::destroy() { - return new GeluPlugin(mLayerName); + // This gets called when the network containing plugin is destroyed + delete this; } -void GeluPlugin::setPluginNamespace(const char* libNamespace) +void GeluPluginDynamic::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* GeluPlugin::getPluginNamespace() const +const char* GeluPluginDynamic::getPluginNamespace() const { return mNamespace.c_str(); } -GeluPluginCreator::GeluPluginCreator() +/////////////// + +GeluPluginDynamicCreator::GeluPluginDynamicCreator() { // Fill PluginFieldCollection with PluginField arguments metadata @@ -246,42 +255,43 @@ GeluPluginCreator::GeluPluginCreator() mFC.fields = mPluginAttributes.data(); } -const char* GeluPluginCreator::getPluginName() const +const char* GeluPluginDynamicCreator::getPluginName() const { return GELU_PLUGIN_NAME; } -const char* GeluPluginCreator::getPluginVersion() const +const char* GeluPluginDynamicCreator::getPluginVersion() const { return GELU_PLUGIN_VERSION; } -const PluginFieldCollection* GeluPluginCreator::getFieldNames() +const PluginFieldCollection* GeluPluginDynamicCreator::getFieldNames() { return &mFC; } -IPluginV2* GeluPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +IPluginV2* GeluPluginDynamicCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { - gLogVerbose << "Creating GeluPlugin...\n"; - GeluPlugin* p = new GeluPlugin(name); + gLogVerbose << "Creating GeluPluginDynamic...\n"; + GeluPluginDynamic* p = new GeluPluginDynamic(name); return p; } -IPluginV2* GeluPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) +IPluginV2* GeluPluginDynamicCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will - // call GeluPlugin::destroy() - return new GeluPlugin(name, serialData, serialLength); + // call GeluPluginDynamic::destroy() + return new GeluPluginDynamic(name, serialData, serialLength); } -void GeluPluginCreator::setPluginNamespace(const char* libNamespace) +void GeluPluginDynamicCreator::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* 
GeluPluginCreator::getPluginNamespace() const +const char* GeluPluginDynamicCreator::getPluginNamespace() const { return mNamespace.c_str(); } } +} diff --git a/demo/BERT/plugins/geluPlugin.h b/demo/BERT/plugins/geluPlugin.h index 46164c5d..2cf06dec 100644 --- a/demo/BERT/plugins/geluPlugin.h +++ b/demo/BERT/plugins/geluPlugin.h @@ -24,91 +24,83 @@ namespace bert { -using namespace nvinfer1; +namespace test +{ // One of the preferred ways of making TensorRT to be able to see // our custom layer requires extending IPluginV2 and IPluginCreator classes. // For requirements for overriden functions, check TensorRT API docs. -class GeluPlugin : public IPluginV2 +class GeluPluginDynamic : public nvinfer1::IPluginV2DynamicExt { public: - GeluPlugin(const std::string name); + GeluPluginDynamic(const std::string name); - GeluPlugin(const std::string name, const void* data, size_t length); + GeluPluginDynamic(const std::string name, const void* data, size_t length); - // It doesn't make sense to make GeluPlugin without arguments, so we delete + // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete // default constructor. - GeluPlugin() = delete; - + GeluPluginDynamic() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + // IPluginV2 Methods + const char* getPluginType() const override; + const char* getPluginVersion() const override; int getNbOutputs() const override; - - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; - int initialize() override; - void terminate() override; - - size_t getWorkspaceSize(int) const override - { - return 0; - }; - - int enqueue( - int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - - void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, - PluginFormat format, int maxBatchSize) override; - - bool supportsFormat(DataType type, PluginFormat format) const override; - - const char* getPluginType() const override; - - const char* getPluginVersion() const override; - void destroy() override; - - nvinfer1::IPluginV2* clone() const override; - void setPluginNamespace(const char* pluginNamespace) override; - const char* getPluginNamespace() const override; private: const std::string mLayerName; - size_t mInputVolume; std::string mNamespace; - DataType mType; + nvinfer1::DataType mType; }; -class GeluPluginCreator : public 
IPluginCreator +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginCreator(); + GeluPluginDynamicCreator(); const char* getPluginName() const override; const char* getPluginVersion() const override; - const PluginFieldCollection* getFieldNames() override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; - IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) override; - IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; std::string mNamespace; }; } +} #endif // TRT_GELU_PLUGIN_H diff --git a/demo/BERT/plugins/pluginUtil.h b/demo/BERT/plugins/pluginUtil.h index 7e939fd0..edb5f9d2 100644 --- a/demo/BERT/plugins/pluginUtil.h +++ b/demo/BERT/plugins/pluginUtil.h @@ -19,20 +19,15 @@ #include "cublas_v2.h" #include "cuda_fp16.h" +#include "common.h" +#include namespace bert { -#define CHECK(status) \ - do \ - { \ - auto ret = (status); \ - if (ret != 0) \ - { \ - std::cout << "Cuda failure: " << ret << std::endl; \ - abort(); \ - } \ - } while (0) +constexpr uint32_t BDIM = 0; // batch dimension +constexpr uint32_t SDIM = 1; // seq len dimension +constexpr uint32_t HDIM = 2; // hidden dimension #define DESER(d, m) m = readFromBuffer(d) diff --git a/demo/BERT/plugins/qkvToContextPlugin.cu b/demo/BERT/plugins/qkvToContextPlugin.cu index fff31fe5..e4c6d1ac 100644 --- a/demo/BERT/plugins/qkvToContextPlugin.cu +++ b/demo/BERT/plugins/qkvToContextPlugin.cu @@ -25,13 +25,15 @@ #include #include #include +#include + +using namespace nvinfer1; namespace bert { -using namespace nvinfer1; - -constexpr size_t kAlignment = 256; +namespace test +{ template __global__ void transposeCtx(const int H, const T* input, T* output) @@ -59,7 +61,7 @@ __global__ void transposeCtx(const int H, const T* input, T* output) } } -void launchTransCtx(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, +inline void launchTransCtx(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, const float* input, float* output) { @@ -81,7 +83,7 @@ void launchTransCtx(cudaStream_t stream, const int S, const int B, const int hea } } -void launchTransCtx(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, +inline void launchTransCtx(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, const half* input, half* output) { const dim3 grid(S, B, 1); @@ -136,7 +138,7 @@ __global__ void transposeQKV(const int H, const T* input, T* output) } } -void launchTransQkv(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, +inline void launchTransQkv(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, const float* input, float* output) { @@ -157,7 +159,7 @@ void launchTransQkv(cudaStream_t stream, const int S, const int B, const int hea CHECK(cudaPeekAtLastError()); } -void launchTransQkv(cudaStream_t stream, const int S, const int B, 
const int headSize, const int numHeads, +inline void launchTransQkv(cudaStream_t stream, const int S, const int B, const int headSize, const int numHeads, const half* input, half* output) { const dim3 grid(S, B, 3); @@ -186,7 +188,7 @@ void launchTransQkv(cudaStream_t stream, const int S, const int B, const int hea } template -int qkvToCtx(cublasHandle_t& cublas, const int B, const int S, const int numHeads, const int headSize, +inline int qkvToCtx(cublasHandle_t& cublas, const int B, const int S, const int numHeads, const int headSize, const float rsqrtHeadSize, const T* input, T* output, T* qkptr, T* pptr, T* tptr, cudaStream_t stream, const int* maskIdx = nullptr) { @@ -234,22 +236,25 @@ int qkvToCtx(cublasHandle_t& cublas, const int B, const int S, const int numHead namespace { -static const char* QKVToCONTEXT_PLUGIN_VERSION{"1"}; -static const char* QKVToCONTEXT_PLUGIN_NAME{"CustomQKVToContextPlugin"}; +static const char* QKV_TO_CONTEXT_PLUGIN_VERSION{"1"}; +static const char* QKV_TO_CONTEXT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic"}; } // namespace // Static class fields initialization -PluginFieldCollection QKVToContextPluginCreator::mFC{}; -std::vector QKVToContextPluginCreator::mPluginAttributes; +PluginFieldCollection QKVToContextPluginDynamicCreator::mFC{}; +std::vector QKVToContextPluginDynamicCreator::mPluginAttributes; -REGISTER_TENSORRT_PLUGIN(QKVToContextPluginCreator); +REGISTER_TENSORRT_PLUGIN(QKVToContextPluginDynamicCreator); -QKVToContextPlugin::QKVToContextPlugin( - const std::string name, const int hiddenSize, const int numHeads, const int S, bool hasImask) +constexpr size_t kAlignment = 256; +constexpr uint32_t IIDX = 0; // index of the input tensor +constexpr uint32_t MIDX = 1; // index of the mask + + +QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, const int hiddenSize, const int numHeads, bool hasImask) : mLayerName(name) , mHiddenSize(hiddenSize) , mNumHeads(numHeads) - , mS(S) , mHasImask(hasImask) { assert(hiddenSize % numHeads == 0); @@ -257,7 +262,7 @@ QKVToContextPlugin::QKVToContextPlugin( mRsqrtHeadSize = 1.f / sqrt(float(mHeadSize)); } -QKVToContextPlugin::QKVToContextPlugin(const std::string name, const void* data, size_t length) +QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, const void* data, size_t length) : mLayerName(name) { @@ -268,295 +273,313 @@ QKVToContextPlugin::QKVToContextPlugin(const std::string name, const void* data, gLogVerbose << "QKV Deser Start" << std::endl; DESER(d, mType); - DESER(d, mS); DESER(d, mNumHeads); DESER(d, mHeadSize); DESER(d, mRsqrtHeadSize); DESER(d, mHasImask); + DESER(d, mHiddenSize); gLogVerbose << "QKV Deser done" << std::endl; assert(d == (a + length)); } -const char* QKVToContextPlugin::getPluginType() const +// IPluginV2DynamicExt Methods +nvinfer1::IPluginV2DynamicExt* QKVToContextPluginDynamic::clone() const { - return QKVToCONTEXT_PLUGIN_NAME; + gLogVerbose << "QKV Clone" << std::endl; + auto ret = new QKVToContextPluginDynamic(mLayerName, mHiddenSize, mNumHeads, mHasImask); + ret->mType = mType; + ret->initialize(); + gLogVerbose << "QKV Clone done" << std::endl; + return ret; } -const char* QKVToContextPlugin::getPluginVersion() const +DimsExprs QKVToContextPluginDynamic::getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) { - return QKVToCONTEXT_PLUGIN_VERSION; + // Input is BxSx3*N*H, output should be BxSxN*H + assert(outputIndex == 0); + // Copy over everything + DimsExprs 
output(inputs[IIDX]); + // Divide last dim by three + auto three = exprBuilder.constant(3); + output.d[HDIM] = exprBuilder.operation(DimensionOperation::kFLOOR_DIV, *inputs[IIDX].d[HDIM], *three); + return output; } - -int QKVToContextPlugin::getNbOutputs() const +bool QKVToContextPluginDynamic::supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) { - return 1; + assert(pos >= 0); + assert(pos < 2 + mHasImask); + assert(nbInputs == 1 + mHasImask); + const auto* in = inOut; + const auto* out = inOut + nbInputs; + if (pos == 0) + { + // must not check descriptions > pos + return (in->type == DataType::kFLOAT || in->type == DataType::kHALF) && // precision + (in->format == TensorFormat::kLINEAR) && // format + (in->dims.nbDims == 5) && // num dims + ((in->dims.d[HDIM] % 3) == 0) && // see getOutputDimensions + ((in->dims.d[3]) == 1) && // for fc + ((in->dims.d[4]) == 1) // for fc + ; + } + else + { // pos==1 + if ((mHasImask && pos == 1)) + { + const auto* inMask = &inOut[1]; + return (inMask->type == DataType::kINT32) && // precision + (inMask->format == TensorFormat::kLINEAR) && // format + (inMask->dims.nbDims == 1) && // num dims + ((inMask->dims.d[BDIM]) == in->dims.d[BDIM]) // check B + ; + } + if (!mHasImask || (pos == 2)) + { + return (in->type == out->type) && // precision + (out->format == TensorFormat::kLINEAR) && // format + (out->dims.nbDims == 5) && // num dims + ((in->dims.d[HDIM] / 3) == (out->dims.d[HDIM])) && // div 3 + ((out->dims.d[3]) == 1) && // for fc + ((out->dims.d[4]) == 1) && // for fc + ((out->dims.d[BDIM]) == in->dims.d[BDIM]) && // check B + ((out->dims.d[SDIM]) == in->dims.d[SDIM]) // check S + ; + } + } + return false; } - -Dims QKVToContextPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) +void QKVToContextPluginDynamic::configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) { - // Validate input arguments - assert(nbInputDims == 1 + mHasImask); - assert(index == 0); - - return Dims4{mS, mNumHeads * mHeadSize, 1, 1}; + assert(nbInputs == 1 + mHasImask); + assert(nbOutputs == 1); + const PluginTensorDesc& inDesc = in[IIDX].desc; + const PluginTensorDesc& outDesc = out->desc; + mType = inDesc.type; + assert(mType == outDesc.type); + assert(inDesc.dims.d[BDIM] == outDesc.dims.d[BDIM]); + assert(inDesc.dims.d[SDIM] == outDesc.dims.d[SDIM]); + assert(inDesc.dims.d[HDIM] == 3 * outDesc.dims.d[HDIM]); + if (mHasImask) + { + const PluginTensorDesc& maskDesc = in[MIDX].desc; + assert(maskDesc.type == DataType::kINT32); + assert(maskDesc.dims.d[0] == inDesc.dims.d[BDIM]); + } } -void QKVToContextPlugin::attachToContext(cudnnContext* cudnn, cublasContext* cublas_, IGpuAllocator* alloc) +size_t QKVToContextPluginDynamic::scratchSize(const int B, const int S) const { - gLogVerbose << "QKV AttachToContext" << std::endl; + size_t wordSize = samplesCommon::getElementSize(mType); + const size_t len = B * mNumHeads * S * S; + const size_t bytes = len * wordSize; + + return bytes; } -int QKVToContextPlugin::initialize() +size_t QKVToContextPluginDynamic::getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs, int nbOutputs) const { - gLogVerbose << "QKV Initialize" << std::endl; - cublasCreate(&cublas); + const int B = inputs->dims.d[BDIM]; + const int S = inputs->dims.d[SDIM]; + const size_t bytes = scratchSize(B, S); + const size_t bytesAligned = alignTo(bytes, kAlignment); + const size_t two = 2; + const 
size_t ws = two * bytesAligned; - return 0; + const size_t wordSize = samplesCommon::getElementSize(mType); + const size_t tp = 3 * B * S * mNumHeads * mHeadSize * wordSize; + + return ws + tp; } -int QKVToContextPlugin::enqueue( - int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +// IPluginV2Ext Methods +DataType QKVToContextPluginDynamic::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + assert(index == 0); + assert(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); + return inputTypes[0]; +} - const size_t bytesAligned = alignTo(scratchSize(batchSize), kAlignment); - char* scratchBytes = reinterpret_cast(workspace); - - char* scratch1 = scratchBytes; - char* scratch2 = scratchBytes + bytesAligned; - char* scratch3 = scratch2 + bytesAligned; - - const int* maskIdx = mHasImask ? static_cast(inputs[1]) : nullptr; +// IPluginV2 Methods +const char* QKVToContextPluginDynamic::getPluginType() const +{ + return QKV_TO_CONTEXT_PLUGIN_NAME; +} - int status = -1; - if (mType == DataType::kFLOAT) - { - const float* input = static_cast(inputs[0]); - float* output = static_cast(outputs[0]); - float* scr1 = reinterpret_cast(scratch1); - float* scr2 = reinterpret_cast(scratch2); - float* scr3 = reinterpret_cast(scratch3); +const char* QKVToContextPluginDynamic::getPluginVersion() const +{ + return QKV_TO_CONTEXT_PLUGIN_VERSION; +} - status = qkvToCtx(cublas, batchSize, mS, mNumHeads, mHeadSize, mRsqrtHeadSize, input, output, scr1, scr2, scr3, - stream, maskIdx); - } - else if (mType == DataType::kHALF) - { - const half* input = static_cast(inputs[0]); - half* output = static_cast(outputs[0]); - half* scr1 = reinterpret_cast(scratch1); - half* scr2 = reinterpret_cast(scratch2); - half* scr3 = reinterpret_cast(scratch3); +int QKVToContextPluginDynamic::getNbOutputs() const +{ + return 1; +} - status = qkvToCtx(cublas, batchSize, mS, mNumHeads, mHeadSize, mRsqrtHeadSize, input, output, scr1, scr2, scr3, - stream, maskIdx); - } - else - { - assert(false); - } +int QKVToContextPluginDynamic::initialize() +{ + cublasCreate(&cublas); + return 0; +} - return status; +void QKVToContextPluginDynamic::terminate() +{ + CHECK(cublasDestroy(cublas)); } -size_t QKVToContextPlugin::getSerializationSize() const +size_t QKVToContextPluginDynamic::getSerializationSize() const { - return sizeof(mNumHeads) + sizeof(mS) + sizeof(mHeadSize) + sizeof(DataType) + sizeof(mRsqrtHeadSize) - + sizeof(mHasImask); + return sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(DataType) + sizeof(mRsqrtHeadSize) + + sizeof(mHasImask) + sizeof(mHiddenSize); } -void QKVToContextPlugin::serialize(void* buffer) const +void QKVToContextPluginDynamic::serialize(void* buffer) const { char* d = static_cast(buffer); const char* a = d; writeToBuffer(d, mType); - writeToBuffer(d, mS); writeToBuffer(d, mNumHeads); writeToBuffer(d, mHeadSize); writeToBuffer(d, mRsqrtHeadSize); writeToBuffer(d, mHasImask); + writeToBuffer(d, mHiddenSize); assert(d == a + getSerializationSize()); } -DataType QKVToContextPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +void QKVToContextPluginDynamic::destroy() { - DataType type = inputTypes[0]; - if (type == DataType::kFLOAT || type == DataType::kHALF) - { - return type; - } - type = DataType::kFLOAT; - return type; + delete this; } -void QKVToContextPlugin::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, - const DataType* 
inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, - const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) +void QKVToContextPluginDynamic::setPluginNamespace(const char* libNamespace) { - // Validate input arguments - assert(nbInputs == 1 + mHasImask); - assert(nbOutputs == 1); - - assert(inputDims[0].nbDims == 4); - assert(inputDims[0].d[0] == mS); - assert(inputDims[0].d[1] == 3 * mHeadSize * mNumHeads); - assert(inputDims[0].d[2] == 1); - assert(inputDims[0].d[3] == 1); - - assert(outputDims[0].nbDims == 4); - assert(outputDims[0].d[0] == mS); - assert(outputDims[0].d[1] == mNumHeads * mHeadSize); - assert(outputDims[0].d[2] == 1); - assert(outputDims[0].d[3] == 1); - mType = outputTypes[0]; - if (!(mType == DataType::kHALF || mType == DataType::kFLOAT)) - mType = DataType::kFLOAT; - if (mHasImask) - { - assert(inputTypes[1] == DataType::kINT32); - } -} - -bool QKVToContextPlugin::supportsFormat(DataType type, PluginFormat format) const -{ - if (type == DataType::kFLOAT || type == DataType::kHALF || type == DataType::kINT32) - { - return format == PluginFormat::kNCHW; - } - else - { - return false; - } + mNamespace = libNamespace; } -void QKVToContextPlugin::terminate() +const char* QKVToContextPluginDynamic::getPluginNamespace() const { - gLogVerbose << "QKV Terminate " << std::endl; - CHECK(cublasDestroy(cublas)); - gLogVerbose << "QKV Terminate done" << std::endl; + return mNamespace.c_str(); } -size_t QKVToContextPlugin::scratchSize(int batchsize) const +int QKVToContextPluginDynamic::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) { - size_t wordSize = samplesCommon::getElementSize(mType); - const size_t len = batchsize * mNumHeads * mS * mS; - const size_t bytes = len * wordSize; - return bytes; -} + const int batchSize = inputDesc->dims.d[BDIM]; + const int S = inputDesc->dims.d[SDIM]; -size_t QKVToContextPlugin::getWorkspaceSize(int batchsize) const -{ - const size_t bytes = scratchSize(batchsize); - const size_t bytesAligned = alignTo(bytes, kAlignment); - const size_t two = 2; - const size_t ws = two * bytesAligned; + const size_t bytesAligned = alignTo(scratchSize(batchSize, S), kAlignment); + char* scratchBytes = reinterpret_cast(workspace); - size_t wordSize = samplesCommon::getElementSize(mType); - const size_t tp = 3 * batchsize * mS * mNumHeads * mHeadSize * wordSize; + char* scratch1 = scratchBytes; + char* scratch2 = scratchBytes + bytesAligned; + char* scratch3 = scratch2 + bytesAligned; - return ws + tp; -} + const int* maskIdx = mHasImask ? 
static_cast(inputs[1]) : nullptr; -void QKVToContextPlugin::destroy() {} + int status = -1; + if (mType == DataType::kFLOAT) + { + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + float* scr1 = reinterpret_cast(scratch1); + float* scr2 = reinterpret_cast(scratch2); + float* scr3 = reinterpret_cast(scratch3); -IPluginV2Ext* QKVToContextPlugin::clone() const -{ - gLogVerbose << "QKV Clone" << std::endl; - auto ret = new QKVToContextPlugin(mLayerName, mHiddenSize, mNumHeads, mS, mHasImask); - ret->mType = mType; - ret->initialize(); - gLogVerbose << "QKV Clone done" << std::endl; - return ret; -} + status = qkvToCtx(cublas, batchSize, S, mNumHeads, mHeadSize, mRsqrtHeadSize, input, output, scr1, scr2, scr3, + stream, maskIdx); + } + else if (mType == DataType::kHALF) + { + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + half* scr1 = reinterpret_cast(scratch1); + half* scr2 = reinterpret_cast(scratch2); + half* scr3 = reinterpret_cast(scratch3); -void QKVToContextPlugin::setPluginNamespace(const char* libNamespace) -{ - mNamespace = libNamespace; -} + status = qkvToCtx(cublas, batchSize, S, mNumHeads, mHeadSize, mRsqrtHeadSize, input, output, scr1, scr2, scr3, + stream, maskIdx); + } + else + { + assert(false); + } -const char* QKVToContextPlugin::getPluginNamespace() const -{ - return mNamespace.c_str(); + return status; } -QKVToContextPluginCreator::QKVToContextPluginCreator() +QKVToContextPluginDynamicCreator::QKVToContextPluginDynamicCreator() { mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } -const char* QKVToContextPluginCreator::getPluginName() const +const char* QKVToContextPluginDynamicCreator::getPluginName() const { - return QKVToCONTEXT_PLUGIN_NAME; + return QKV_TO_CONTEXT_PLUGIN_NAME; } -const char* QKVToContextPluginCreator::getPluginVersion() const +const char* QKVToContextPluginDynamicCreator::getPluginVersion() const { - return QKVToCONTEXT_PLUGIN_VERSION; + return QKV_TO_CONTEXT_PLUGIN_VERSION; } -const PluginFieldCollection* QKVToContextPluginCreator::getFieldNames() +const PluginFieldCollection* QKVToContextPluginDynamicCreator::getFieldNames() { return &mFC; } -IPluginV2* QKVToContextPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +IPluginV2* QKVToContextPluginDynamicCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { gLogVerbose << "Creating QKV2ContextPlugin...\n"; int hidden_size; int num_heads; - int S; bool has_mask; - for(int i=0; i< fc->nbFields; i++) + for (int i = 0; i < fc->nbFields; i++) { std::string field_name(fc->fields[i].name); - if (field_name.compare("hidden_size")==0) + if (field_name.compare("hidden_size") == 0) { hidden_size = *static_cast(fc->fields[i].data); gLogVerbose << "Building hidden_size: " << hidden_size << std::endl; } - if (field_name.compare("num_heads")==0) + if (field_name.compare("num_heads") == 0) { - num_heads = *static_cast(fc->fields[i].data); + num_heads = *static_cast(fc->fields[i].data); gLogVerbose << "Building num_heads: " << num_heads << std::endl; } - if (field_name.compare("S")==0) + if (field_name.compare("has_mask") == 0) { - S = *static_cast(fc->fields[i].data); - gLogVerbose << "Building S: " << S << std::endl; - } - if (field_name.compare("has_mask")==0) - { - has_mask = *static_cast(fc->fields[i].data); - gLogVerbose << "Building has_mask: " << has_mask << std::endl; + has_mask = *static_cast(fc->fields[i].data); + gLogVerbose << "Building has_mask: " 
<< has_mask << std::endl; } } gLogVerbose << "Building the Plugin...\n"; - QKVToContextPlugin* p = new QKVToContextPlugin(name, hidden_size, num_heads, S, has_mask); + QKVToContextPluginDynamic* p = new QKVToContextPluginDynamic(name, hidden_size, num_heads, has_mask); return p; } -IPluginV2* QKVToContextPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) +IPluginV2* QKVToContextPluginDynamicCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will - // call QKVToContextPlugin::destroy() - return new QKVToContextPlugin(name, serialData, serialLength); + // call QKVToContextPluginDynamic::destroy() + return new QKVToContextPluginDynamic(name, serialData, serialLength); } -void QKVToContextPluginCreator::setPluginNamespace(const char* libNamespace) +void QKVToContextPluginDynamicCreator::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* QKVToContextPluginCreator::getPluginNamespace() const +const char* QKVToContextPluginDynamicCreator::getPluginNamespace() const { return mNamespace.c_str(); } } +} diff --git a/demo/BERT/plugins/qkvToContextPlugin.h b/demo/BERT/plugins/qkvToContextPlugin.h index 3aa3bedb..b199332d 100644 --- a/demo/BERT/plugins/qkvToContextPlugin.h +++ b/demo/BERT/plugins/qkvToContextPlugin.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef TRT_QKV_TO_CONTEXT_PLUGIN_h -#define TRT_QKV_TO_CONTEXT_PLUGIN_h +#ifndef TRT_QKV_TO_CONTEXT_PLUGIN_H +#define TRT_QKV_TO_CONTEXT_PLUGIN_H #include "NvInferPlugin.h" #include "cublas_v2.h" @@ -25,75 +25,55 @@ namespace bert { -using namespace nvinfer1; +namespace test +{ // One of the preferred ways of making TensorRT to be able to see // our custom layer requires extending IPluginV2 and IPluginCreator classes. // For requirements for overriden functions, check TensorRT API docs. -class QKVToContextPlugin : public IPluginV2Ext +class QKVToContextPluginDynamic : public nvinfer1::IPluginV2DynamicExt { public: - QKVToContextPlugin( - const std::string name, const int hidden_size, const int num_heads, const int S, bool has_imask = false); + QKVToContextPluginDynamic( + const std::string name, const int hidden_size, const int num_heads, bool has_imask = false); - QKVToContextPlugin(const std::string name, const void* data, size_t length); + QKVToContextPluginDynamic(const std::string name, const void* data, size_t length); - // It doesn't make sense to make QKVToContextPlugin without arguments, so we + // It doesn't make sense to make QKVToContextPluginDynamic without arguments, so we // delete default constructor. 
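Note: some concrete numbers may help tie the QKVToContextPluginDynamic pieces together. With BERT-base-like values (illustrative only, not taken from this patch) hidden_size = 768 and num_heads = 12, the constructor yields headSize = 64 and rsqrtHeadSize = 0.125, getOutputDimensions turns a B×S×2304×1×1 input into a B×S×768×1×1 output via the floor-division by three, and getWorkspaceSize requests two aligned B·numHeads·S·S scratch buffers plus room for the transposed QKV tensor. A hedged sketch of that arithmetic:

    #include <cstddef>

    // Illustrative (not from this patch): workspace arithmetic for
    // QKVToContextPluginDynamic with BERT-base-like sizes and fp16 activations.
    inline size_t exampleQkvWorkspaceBytes()
    {
        const int hiddenSize = 768, numHeads = 12, B = 1, S = 128;
        const int headSize = hiddenSize / numHeads;                          // 64 -> rsqrtHeadSize = 0.125
        const size_t wordSize = 2;                                           // fp16
        const size_t scratch = size_t(B) * numHeads * S * S * wordSize;      // 393216 bytes
        const size_t aligned = (scratch + 255) / 256 * 256;                  // kAlignment = 256
        const size_t qkvTransposed = 3ull * B * S * numHeads * headSize * wordSize; // 589824 bytes
        return 2 * aligned + qkvTransposed;                                  // 1376256 bytes (~1.3 MiB)
    }

Under these example sizes the input tensor would be 1×128×2304×1×1 and the output 1×128×768×1×1.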
- QKVToContextPlugin() = delete; - - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const - { - return false; - } - - bool canBroadcastInputAcrossBatch(int inputIndex) const - { - return false; - } - + QKVToContextPluginDynamic() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + // IPluginV2 Methods + const char* getPluginType() const override; + const char* getPluginVersion() const override; int getNbOutputs() const override; - - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; - int initialize() override; - void terminate() override; - - size_t getWorkspaceSize(int) const override; - - int enqueue( - int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - - DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; - - void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, - const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, - const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; - - bool supportsFormat(DataType type, PluginFormat format) const override; - - const char* getPluginType() const override; - - const char* getPluginVersion() const override; - void destroy() override; - - nvinfer1::IPluginV2Ext* clone() const override; - void setPluginNamespace(const char* pluginNamespace) override; - const char* getPluginNamespace() const override; - void attachToContext(cudnnContext* cudnn, cublasContext* cublas, IGpuAllocator* alloc) override; - private: - size_t scratchSize(int bs) const; + size_t scratchSize(const int B, const int S) const; float mRsqrtHeadSize; int mHeadSize; int mB; @@ -104,33 +84,34 @@ class QKVToContextPlugin : public IPluginV2Ext const std::string mLayerName; std::string mNamespace; - DataType mType; + nvinfer1::DataType mType; cublasHandle_t cublas; }; -class QKVToContextPluginCreator : public IPluginCreator +class QKVToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QKVToContextPluginCreator(); + QKVToContextPluginDynamicCreator(); const char* getPluginName() const override; const char* getPluginVersion() const override; - const PluginFieldCollection* getFieldNames() override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; - IPluginV2* createPlugin(const 
char* name, const PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) override; - IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; std::string mNamespace; }; } -#endif // TRT_QKV_TO_CONTEXT_PLUGIN_h +} +#endif // TRT_QKV_TO_CONTEXT_PLUGIN_H diff --git a/demo/BERT/plugins/skipLayerNormPlugin.cu b/demo/BERT/plugins/skipLayerNormPlugin.cu index a2662914..3c66976f 100644 --- a/demo/BERT/plugins/skipLayerNormPlugin.cu +++ b/demo/BERT/plugins/skipLayerNormPlugin.cu @@ -19,16 +19,21 @@ #include "pluginKernels.h" #include "pluginUtil.h" #include "skipLayerNormPlugin.h" +#include "common.h" #include #include #include +using namespace nvinfer1; using bert::operator+; namespace bert { +namespace test +{ + template __global__ void skipLayerNormKernelSmall( const int ld, const T* input, const T* skip, const float* beta, const float* gamma, T* output) @@ -113,22 +118,20 @@ int computeSkipLayerNorm(cudaStream_t stream, const int ld, const int n, const T return 0; } -using namespace nvinfer1; - // Clip plugin specific constants namespace { static const char* SKIP_LAYER_NORM_VERSION{"1"}; -static const char* SKIP_LAYER_NORM_NAME{"CustomSkipLayerNormPlugin"}; +static const char* SKIP_LAYER_NORM_NAME{"CustomSkipLayerNormPluginDynamic"}; } // namespace // Static class fields initialization -PluginFieldCollection SkipLayerNormPluginCreator::mFC{}; -std::vector SkipLayerNormPluginCreator::mPluginAttributes; +PluginFieldCollection SkipLayerNormPluginDynamicCreator::mFC{}; +std::vector SkipLayerNormPluginDynamicCreator::mPluginAttributes; -REGISTER_TENSORRT_PLUGIN(SkipLayerNormPluginCreator); +REGISTER_TENSORRT_PLUGIN(SkipLayerNormPluginDynamicCreator); -SkipLayerNormPlugin::SkipLayerNormPlugin( +SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic( const std::string name, const int ld, const Weights& beta, const Weights& gamma) : mLayerName(name) , mLd(ld) @@ -137,7 +140,7 @@ SkipLayerNormPlugin::SkipLayerNormPlugin( { } -SkipLayerNormPlugin::SkipLayerNormPlugin(const std::string name, const void* data, size_t length) +SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string name, const void* data, size_t length) : mLayerName(name) { gLogVerbose << "Skip LN Deser start\n"; @@ -146,7 +149,6 @@ SkipLayerNormPlugin::SkipLayerNormPlugin(const std::string name, const void* dat const char* a = d; DESER(d, mType); DESER(d, mLd); - DESER(d, mInputVolume); mBetaDev = deserToDev(d, mLd); mGammaDev = deserToDev(d, mLd); assert(d == (a + length)); @@ -159,65 +161,81 @@ SkipLayerNormPlugin::SkipLayerNormPlugin(const std::string name, const void* dat gLogVerbose << "Skip LN Deser done\n"; } -const char* SkipLayerNormPlugin::getPluginType() const +// IPluginV2DynamicExt Methods +IPluginV2DynamicExt* SkipLayerNormPluginDynamic::clone() const { - return SKIP_LAYER_NORM_NAME; + return new SkipLayerNormPluginDynamic(mLayerName, mLd, mBeta, mGamma); } -const char* SkipLayerNormPlugin::getPluginVersion() const +DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(int outputIndex, const 
DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) { - return SKIP_LAYER_NORM_VERSION; + assert(nbInputs == 2); + assert(outputIndex == 0); + assert(inputs[0].nbDims == inputs[1].nbDims); + return inputs[0]; } -int SkipLayerNormPlugin::getNbOutputs() const +bool SkipLayerNormPluginDynamic::supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) { - return 1; -} + assert(nbInputs == 2); + assert(nbOutputs == 1); -Dims SkipLayerNormPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) -{ - // Validate input arguments - assert(nbInputDims == 2); - assert(index == 0); - assert(inputs[0].nbDims == inputs[1].nbDims); - for (int d = 0; d < inputs[0].nbDims; d++) + const PluginTensorDesc& in = inOut[pos]; + if (pos == 0) { - assert(inputs[0].d[d] == inputs[1].d[d]); + return (in.type == DataType::kFLOAT || in.type == DataType::kHALF) && (in.format == TensorFormat::kLINEAR); } + const PluginTensorDesc& prev = inOut[pos - 1]; - return inputs[0]; + if (pos == 1) + { + return in.type == prev.type && in.format == prev.format; + } + // output + return in.type == prev.type && in.format == prev.format; } -int SkipLayerNormPlugin::initialize() +void SkipLayerNormPluginDynamic::configurePlugin(const DynamicPluginTensorDesc* inputs, int nbInputs, + const DynamicPluginTensorDesc* outputs, int nbOutputs) +{ + // Validate input arguments + assert(nbOutputs == 1); + assert(nbInputs == 2); + mType = inputs[0].desc.type; + assert(mType == inputs[1].desc.type); + const auto& inDims0 = inputs[0].desc.dims; + const auto& inDims1 = inputs[1].desc.dims; + assert(inDims0.nbDims == inDims1.nbDims); + + assert(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d)); + + assert(inDims0.nbDims== 5); + mLd = inDims0.d[2]; // hiddensize + assert(inDims0.d[3] == 1); + assert(inDims0.d[4] == 1); +} + +size_t SkipLayerNormPluginDynamic::getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, + const PluginTensorDesc* outputs, int nbOutputs) const { - if (mGamma.values) - { - CHECK(cudaMalloc(&mGammaDev, sizeof(float) * mGamma.count)); - CHECK(cudaMemcpy(mGammaDev, mGamma.values, sizeof(float) * mGamma.count, cudaMemcpyHostToDevice)); - } - if (mBeta.values) - { - CHECK(cudaMalloc(&mBetaDev, sizeof(float) * mBeta.count)); - CHECK(cudaMemcpy(mBetaDev, mBeta.values, sizeof(float) * mGamma.count, cudaMemcpyHostToDevice)); - } return 0; } -int SkipLayerNormPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) +int SkipLayerNormPluginDynamic::enqueue(const PluginTensorDesc* inputDesc, + const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) { + const int inputVolume = samplesCommon::volume(inputDesc[0].dims); int status = -1; // Our plugin outputs only one tensor - // Launch CUDA kernel wrapper and save its return value if (mType == DataType::kFLOAT) { - const float* input = static_cast(inputs[0]); const float* skip = static_cast(inputs[1]); float* output = static_cast(outputs[0]); - status = computeSkipLayerNorm( - stream, mLd, mInputVolume * batchSize, input, skip, mBetaDev, mGammaDev, output); + status = computeSkipLayerNorm(stream, mLd, inputVolume, input, skip, mBetaDev, mGammaDev, output); } else if (mType == DataType::kHALF) { @@ -225,70 +243,57 @@ int SkipLayerNormPlugin::enqueue(int batchSize, const void* const* inputs, void* const half* skip = static_cast(inputs[1]); half* output = static_cast(outputs[0]); - status = 
computeSkipLayerNorm( - stream, mLd, mInputVolume * batchSize, input, skip, mBetaDev, mGammaDev, output); + status = computeSkipLayerNorm(stream, mLd, inputVolume, input, skip, mBetaDev, mGammaDev, output); } else { + gLogError << "Unsupported Type\n"; assert(false); } return status; } -size_t SkipLayerNormPlugin::getSerializationSize() const +// IPluginV2Ext Methods +DataType SkipLayerNormPluginDynamic::getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const { - return 2 * sizeof(float) * mLd + sizeof(DataType) + sizeof(mLd) + sizeof(mInputVolume); + assert(index == 0); + assert(nbInputs == 2); + assert(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); + assert(inputTypes[0] == inputTypes[1]); + return inputTypes[0]; } -void SkipLayerNormPlugin::serialize(void* buffer) const +// IPluginV2 Methods +const char* SkipLayerNormPluginDynamic::getPluginType() const { - char* d = static_cast(buffer); - const char* a = d; - - writeToBuffer(d, mType); - writeToBuffer(d, mLd); - writeToBuffer(d, mInputVolume); - serFromDev(d, mBetaDev, mLd); - serFromDev(d, mGammaDev, mLd); - assert(d == a + getSerializationSize()); + return SKIP_LAYER_NORM_NAME; } -void SkipLayerNormPlugin::configureWithFormat( - const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, DataType type, PluginFormat format, int) +const char* SkipLayerNormPluginDynamic::getPluginVersion() const { - // Validate input arguments - assert(nbOutputs == 1); - assert(nbInputs == 2); - - // Fetch volume for future enqueue() operations - size_t volume = 1; - for (int i = 0; i < inputs->nbDims; i++) - { - volume *= inputs->d[i]; - } - mInputVolume = volume; - assert(inputs->nbDims == 4); - mLd = inputs->d[1]; // hiddensize - assert(inputs->d[2] == 1); - assert(inputs->d[3] == 1); - - mType = type; + return SKIP_LAYER_NORM_VERSION; } -bool SkipLayerNormPlugin::supportsFormat(DataType type, PluginFormat format) const +int SkipLayerNormPluginDynamic::getNbOutputs() const { - // This plugin only supports ordinary floats, and NCHW input format - if (type == DataType::kFLOAT || type == DataType::kHALF) + return 1; +} +int SkipLayerNormPluginDynamic::initialize() +{ + if (mGamma.values) { - return format == PluginFormat::kNCHW; + CHECK(cudaMalloc(&mGammaDev, sizeof(float) * mGamma.count)); + CHECK(cudaMemcpy(mGammaDev, mGamma.values, sizeof(float) * mGamma.count, cudaMemcpyHostToDevice)); } - else + if (mBeta.values) { - return false; + CHECK(cudaMalloc(&mBetaDev, sizeof(float) * mBeta.count)); + CHECK(cudaMemcpy(mBetaDev, mBeta.values, sizeof(float) * mGamma.count, cudaMemcpyHostToDevice)); } + return 0; } -void SkipLayerNormPlugin::terminate() +void SkipLayerNormPluginDynamic::terminate() { gLogVerbose << "SKIPLN terminate start" << std::endl; cudaFree(mGammaDev); @@ -296,66 +301,80 @@ void SkipLayerNormPlugin::terminate() gLogVerbose << "SKIPLN terminate done" << std::endl; } -void SkipLayerNormPlugin::destroy() +size_t SkipLayerNormPluginDynamic::getSerializationSize() const { - // This gets called when the network containing plugin is destroyed - delete this; + return 2 * sizeof(float) * mLd + sizeof(DataType) + sizeof(mLd) ; } -IPluginV2* SkipLayerNormPlugin::clone() const +void SkipLayerNormPluginDynamic::serialize(void* buffer) const { - return new SkipLayerNormPlugin(mLayerName, mLd, mBeta, mGamma); + char* d = static_cast(buffer); + const char* a = d; + + writeToBuffer(d, mType); + writeToBuffer(d, mLd); + serFromDev(d, mBetaDev, mLd); + serFromDev(d, mGammaDev, mLd); + assert(d == a + 
getSerializationSize()); +} + +void SkipLayerNormPluginDynamic::destroy() +{ + // This gets called when the network containing plugin is destroyed + delete this; } -void SkipLayerNormPlugin::setPluginNamespace(const char* libNamespace) +void SkipLayerNormPluginDynamic::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* SkipLayerNormPlugin::getPluginNamespace() const +const char* SkipLayerNormPluginDynamic::getPluginNamespace() const { return mNamespace.c_str(); } -SkipLayerNormPluginCreator::SkipLayerNormPluginCreator() +///////////////////////////////////////////////////////// + +SkipLayerNormPluginDynamicCreator::SkipLayerNormPluginDynamicCreator() { mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } -const char* SkipLayerNormPluginCreator::getPluginName() const +const char* SkipLayerNormPluginDynamicCreator::getPluginName() const { return SKIP_LAYER_NORM_NAME; } -const char* SkipLayerNormPluginCreator::getPluginVersion() const +const char* SkipLayerNormPluginDynamicCreator::getPluginVersion() const { return SKIP_LAYER_NORM_VERSION; } -const PluginFieldCollection* SkipLayerNormPluginCreator::getFieldNames() +const PluginFieldCollection* SkipLayerNormPluginDynamicCreator::getFieldNames() { return &mFC; } -IPluginV2* SkipLayerNormPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +IPluginV2* SkipLayerNormPluginDynamicCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { - gLogVerbose << "Creating SkipLayerNormPluginCreator...\n"; + gLogVerbose << "Creating SkipLayerNormPluginDynamicCreator...\n"; int ld; Weights beta; Weights gamma; - for(int i=0; i< fc->nbFields; i++) + for (int i = 0; i < fc->nbFields; i++) { std::string field_name(fc->fields[i].name); - if (field_name.compare("ld")==0) + if (field_name.compare("ld") == 0) { ld = *static_cast(fc->fields[i].data); gLogVerbose << "Building ld: " << ld << std::endl; } - if (field_name.compare("beta")==0) + if (field_name.compare("beta") == 0) { gLogVerbose << "Building beta...\n"; beta.values = fc->fields[i].data; @@ -363,7 +382,7 @@ IPluginV2* SkipLayerNormPluginCreator::createPlugin(const char* name, const Plug beta.type = static_cast(fc->fields[i].type); } - if (field_name.compare("gamma")==0) + if (field_name.compare("gamma") == 0) { gLogVerbose << "Building gamma...\n"; gamma.values = fc->fields[i].data; @@ -372,24 +391,25 @@ IPluginV2* SkipLayerNormPluginCreator::createPlugin(const char* name, const Plug } } - SkipLayerNormPlugin* p = new SkipLayerNormPlugin(name, ld, beta, gamma); + SkipLayerNormPluginDynamic* p = new SkipLayerNormPluginDynamic(name, ld, beta, gamma); return p; } -IPluginV2* SkipLayerNormPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) +IPluginV2* SkipLayerNormPluginDynamicCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will - // call SkipLayerNormPlugin::destroy() - return new SkipLayerNormPlugin(name, serialData, serialLength); + // call SkipLayerNormPluginDynamic::destroy() + return new SkipLayerNormPluginDynamic(name, serialData, serialLength); } -void SkipLayerNormPluginCreator::setPluginNamespace(const char* libNamespace) +void SkipLayerNormPluginDynamicCreator::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } -const char* SkipLayerNormPluginCreator::getPluginNamespace() const +const char* 
SkipLayerNormPluginDynamicCreator::getPluginNamespace() const { return mNamespace.c_str(); } } +} diff --git a/demo/BERT/plugins/skipLayerNormPlugin.h b/demo/BERT/plugins/skipLayerNormPlugin.h index 35ee145c..5d64b706 100644 --- a/demo/BERT/plugins/skipLayerNormPlugin.h +++ b/demo/BERT/plugins/skipLayerNormPlugin.h @@ -23,95 +23,88 @@ namespace bert { - -using namespace nvinfer1; +namespace test +{ // One of the preferred ways of making TensorRT to be able to see // our custom layer requires extending IPluginV2 and IPluginCreator classes. // For requirements for overriden functions, check TensorRT API docs. -class SkipLayerNormPlugin : public IPluginV2 +class SkipLayerNormPluginDynamic : public nvinfer1::IPluginV2DynamicExt { public: - SkipLayerNormPlugin(const std::string name, const int ld, const Weights& beta, const Weights& gamma); + SkipLayerNormPluginDynamic(const std::string name, const int ld, const nvinfer1::Weights& beta, const nvinfer1::Weights& gamma); - SkipLayerNormPlugin(const std::string name, const void* data, size_t length); + SkipLayerNormPluginDynamic(const std::string name, const void* data, size_t length); - // It doesn't make sense to make SkipLayerNormPlugin without arguments, so we + // It doesn't make sense to make SkipLayerNormPluginDynamic without arguments, so we // delete default constructor. - SkipLayerNormPlugin() = delete; - + SkipLayerNormPluginDynamic() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + // IPluginV2 Methods + const char* getPluginType() const override; + const char* getPluginVersion() const override; int getNbOutputs() const override; - - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; - int initialize() override; - void terminate() override; - - size_t getWorkspaceSize(int) const override - { - return 0; - }; - - int enqueue( - int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - - void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, - PluginFormat format, int maxBatchSize) override; - - bool supportsFormat(DataType type, PluginFormat format) const override; - - const char* getPluginType() const override; - - const char* getPluginVersion() const override; - void destroy() override; - - nvinfer1::IPluginV2* clone() const override; - void setPluginNamespace(const char* pluginNamespace) override; - const char* 
getPluginNamespace() const override; private: const std::string mLayerName; - size_t mInputVolume; std::string mNamespace; - float *mGammaDev, *mBetaDev; + float *mGammaDev; + float *mBetaDev; size_t mLd; // leading dim - Weights mBeta, mGamma; - DataType mType; + nvinfer1::Weights mBeta; + nvinfer1::Weights mGamma; + nvinfer1::DataType mType; }; -class SkipLayerNormPluginCreator : public IPluginCreator +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginCreator(); + SkipLayerNormPluginDynamicCreator(); const char* getPluginName() const override; const char* getPluginVersion() const override; - const PluginFieldCollection* getFieldNames() override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; - IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) override; - IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; + static nvinfer1::PluginFieldCollection mFC; + static std::vector mPluginAttributes; std::string mNamespace; }; } +} #endif // TRT_SKIP_LAYER_NORM_PLUGIN_H diff --git a/demo/BERT/python/BERT_TRT.ipynb b/demo/BERT/python/BERT_TRT.ipynb index 6c3fa532..07547496 100644 --- a/demo/BERT/python/BERT_TRT.ipynb +++ b/demo/BERT/python/BERT_TRT.ipynb @@ -164,10 +164,8 @@ "import data_processing as dp\n", "import tokenization\n", "\n", - "#Large\n", - "#tokenizer = tokenization.FullTokenizer(vocab_file=\"./data/uncased_L-24_H-1024_A-16/vocab.txt\", do_lower_case=True)\n", "#Base\n", - "tokenizer = tokenization.FullTokenizer(vocab_file=\"./data/uncased_L-12_H-768_A-12/vocab.txt\", do_lower_case=True)\n", + "tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt\", do_lower_case=True)\n", "\n", "# The maximum number of tokens for the question. 
Questions longer than this will be truncated to this length.\n", "max_query_length = 64\n", @@ -200,7 +198,7 @@ "outputs": [], "source": [ "import tensorrt as trt\n", - "TRT_LOGGER = trt.Logger(trt.Logger.WARNING)" + "TRT_LOGGER = trt.Logger(trt.Logger.INFO)" ] }, { @@ -210,9 +208,10 @@ "outputs": [], "source": [ "import ctypes\n", - "nvinfer = ctypes.CDLL(\"libnvinfer_plugin.so\", mode = ctypes.RTLD_GLOBAL)\n", - "cm = ctypes.CDLL(\"./build/libcommon.so\", mode = ctypes.RTLD_GLOBAL) \n", - "pg = ctypes.CDLL(\"./build/libbert_plugins.so\", mode = ctypes.RTLD_GLOBAL) " + "import os\n", + "ctypes.CDLL(\"libnvinfer_plugin.so\", mode=ctypes.RTLD_GLOBAL)\n", + "ctypes.CDLL(\"/workspace/TensorRT/demo/BERT/build/libcommon.so\", mode=ctypes.RTLD_GLOBAL)\n", + "ctypes.CDLL(\"/workspace/TensorRT/demo/BERT/build/libbert_plugins.so\", mode=ctypes.RTLD_GLOBAL)" ] }, { @@ -226,57 +225,52 @@ "import numpy as np\n", "import time\n", "\n", - "# For this example we are going to use batch size 1\n", - "max_batch_size = 1\n", - "\n", - "# Load the Large BERT Engine\n", - "# with open(\"./bert_python.engine\", \"rb\") as f, \\\n", - "# trt.Runtime(TRT_LOGGER) as runtime, \\\n", - "# runtime.deserialize_cuda_engine(f.read()) as engine, \\\n", - "# engine.create_execution_context() as context:\n", - "\n", "# Load the Base BERT Engine\n", - "with open(\"./bert_python_base.engine\", \"rb\") as f, \\\n", + "with open(\"/workspace/TensorRT/demo/BERT/bert_base_384.engine\", \"rb\") as f, \\\n", " trt.Runtime(TRT_LOGGER) as runtime, \\\n", " runtime.deserialize_cuda_engine(f.read()) as engine, \\\n", " engine.create_execution_context() as context:\n", "\n", - " print(\"List engine binding:\")\n", - " for binding in engine:\n", - " print(\" - {}: {}, Shape {}, {}\".format(\n", - " \"Input\" if engine.binding_is_input(binding) else \"Output\",\n", - " binding,\n", - " engine.get_binding_shape(binding),\n", - " engine.get_binding_dtype(binding)))\n", - "\n", + " # We always use batch size 1.\n", + " input_shape = (1, max_seq_length)\n", + " input_nbytes = trt.volume(input_shape) * trt.int32.itemsize\n", " \n", - " def binding_nbytes(binding):\n", - " return trt.volume(engine.get_binding_shape(binding)) * engine.get_binding_dtype(binding).itemsize\n", - " \n", - " # Allocate device memory for inputs and outputs.\n", - " d_inputs = [cuda.mem_alloc(binding_nbytes(binding)) for binding in engine if engine.binding_is_input(binding)]\n", - " h_output = cuda.pagelocked_empty(tuple(engine.get_binding_shape(3)), dtype=np.float32)\n", - " d_output = cuda.mem_alloc(h_output.nbytes)\n", - "\n", + " # Allocate device memory for inputs.\n", + " d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]\n", " # Create a stream in which to copy inputs/outputs and run inference.\n", " stream = cuda.Stream()\n", "\n", + " # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)\n", + " # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.\n", + " for binding in range(3):\n", + " context.set_binding_shape(binding, input_shape)\n", + " assert context.all_binding_shapes_specified\n", + "\n", + " # Allocate output buffer by querying the size from the context. 
This may be different for different input shapes.\n", + " h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)\n", + " d_output = cuda.mem_alloc(h_output.nbytes)\n", + "\n", " print(\"\\nRunning Inference...\")\n", " eval_start_time = time.time()\n", "\n", " # Copy inputs\n", - " cuda.memcpy_htod_async(d_inputs[0], input_features[\"input_ids\"], stream)\n", - " cuda.memcpy_htod_async(d_inputs[1], input_features[\"segment_ids\"], stream)\n", - " cuda.memcpy_htod_async(d_inputs[2], input_features[\"input_mask\"], stream)\n", + " cuda.memcpy_htod_async(d_inputs[0], features[\"input_ids\"], stream)\n", + " cuda.memcpy_htod_async(d_inputs[1], features[\"segment_ids\"], stream)\n", + " cuda.memcpy_htod_async(d_inputs[2], features[\"input_mask\"], stream)\n", "\n", " # Run inference\n", - " context.execute_async(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n", + " context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n", " # Transfer predictions back from GPU\n", " cuda.memcpy_dtoh_async(h_output, d_output, stream)\n", " # Synchronize the stream\n", " stream.synchronize()\n", "\n", - " eval_time_elapsed = time.time() - eval_start_time" + " eval_time_elapsed = time.time() - eval_start_time\n", + " \n", + " print(\"-----------------------------\")\n", + " print(\"Running Inference in {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n", + " print(\"-----------------------------\")\n", + " " ] }, { @@ -299,28 +293,25 @@ "metadata": {}, "outputs": [], "source": [ - "start_logits = h_output[:, 0]\n", - "end_logits = h_output[:, 1]\n", + "for index, batch in enumerate(h_output):\n", + " start_logits = batch[:, 0]\n", + " end_logits = batch[:, 1]\n", "\n", - "# The total number of n-best predictions to generate in the nbest_predictions.json output file\n", - "n_best_size = 20\n", + " # The total number of n-best predictions to generate in the nbest_predictions.json output file\n", + " n_best_size = 20\n", "\n", - "# The maximum length of an answer that can be generated. This is needed \n", - "# because the start and end predictions are not conditioned on one another\n", - "max_answer_length = 30\n", + " # The maximum length of an answer that can be generated. 
This is needed \n", + " # because the start and end predictions are not conditioned on one another\n", + " max_answer_length = 30\n", "\n", "\n", - "(prediction, nbest_json, scores_diff_json) = \\\n", - " dp.get_predictions(doc_tokens, features, \\\n", - " start_logits, end_logits, n_best_size, max_answer_length)\n", + " (prediction, nbest_json, scores_diff_json) = \\\n", + " dp.get_predictions(doc_tokens, features, start_logits, end_logits, n_best_size, max_answer_length)\n", "\n", "\n", - "print(\"-----------------------------\")\n", - "print(\"Running Inference in {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n", - "print(\"-----------------------------\")\n", - " \n", - "print(\"Answer: '{}'\".format(prediction))\n", - "print(\"with prob: {:.3f}%\".format(nbest_json[0]['probability']*100.0))\n", + " print(\"Processing output {:} in batch\".format(index))\n", + " print(\"Answer: '{}'\".format(prediction))\n", + " print(\"with prob: {:.3f}%\".format(nbest_json[0]['probability'] * 100.0))\n", "\n" ] } diff --git a/demo/BERT/python/README.md b/demo/BERT/python/README.md index ca4684d9..782e2b31 100644 --- a/demo/BERT/python/README.md +++ b/demo/BERT/python/README.md @@ -25,7 +25,19 @@ python python/bert_builder.py -m /workspace/models/fine-tuned/bert_tf_v2_base_fp This will build and engine with a maximum batch size of 1 (`-b 1`), and sequence length of 384 (`-s 384`) using the `bert_config.json` file located in `workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2` ## Running Inference -Finally, you can run inference with the engine generated from the previous step using the `bert_inference.py` script. This script also accepts a passage and a question. For example, + +### Using the Python Script +You can run inference with the engine generated from the previous step using the `bert_inference.py` script. +This script accepts a passage and a question. For example, +``` +python python/bert_inference.py -e bert_base_384.engine -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt +``` + +### Using the Jupyter Notebook +Alternatively, you can run inference using the included Jupyter notebook. +To launch the jupyter notebook from inside the container, run: ``` -python python/bert_inference.py -e bert_base_384.engine -p "TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations for inference. Today NVIDIA is open sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps." -q "What is TensorRT?" -v /workspace/models/fine-tuned/bert_tf_v2_base_fp16_384_v2/vocab.txt -b 1 +jupyter notebook --ip 0.0.0.0 python/BERT_TRT.ipynb ``` +Then, use your browser to open the link displayed. 
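Both the notebook and the Python scripts touched by this change load the BERT plugin libraries through `ctypes` before any engine work, and the plugin creators are renamed here with a `Dynamic` suffix (e.g. `CustomSkipLayerNormPluginDynamic`). A minimal sketch of that lookup follows; the `.so` paths are the container paths used throughout this demo and would need adjusting for other setups.

```python
import ctypes
import tensorrt as trt

# Loading the shared libraries registers the BERT plugin creators with TensorRT.
ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libcommon.so", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL)

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")

# The creators must be looked up by their new "Dynamic" names; the old names
# (e.g. "CustomSkipLayerNormPlugin") are no longer registered after this change.
registry = trt.get_plugin_registry()
skln_creator = registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic", "1", "")
assert skln_creator is not None, "BERT plugins were not loaded"
```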
+The link should look something like this: `http://127.0.0.1:8888/?token=` diff --git a/demo/BERT/python/bert_builder.py b/demo/BERT/python/bert_builder.py index 6a34491e..586d607e 100644 --- a/demo/BERT/python/bert_builder.py +++ b/demo/BERT/python/bert_builder.py @@ -28,21 +28,21 @@ sys.stderr.write("""Error: Failed to import tensorflow module ({})""".format(err)) sys.exit() -nvinfer = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL) -cm = ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libcommon.so", mode=ctypes.RTLD_GLOBAL) -pg = ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL) - - """ TensorRT Initialization """ TRT_LOGGER = trt.Logger(trt.Logger.INFO) + +ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL) +ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libcommon.so", mode=ctypes.RTLD_GLOBAL) +ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL) + trt.init_libnvinfer_plugins(TRT_LOGGER, "") plg_registry = trt.get_plugin_registry() -qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPlugin", "1", "") -skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPlugin", "1", "") -gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPlugin", "1", "") -emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPlugin", "1", "") +qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic", "1", "") +skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic", "1", "") +gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic", "1", "") +emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic", "1", "") """ @@ -79,7 +79,6 @@ SQD_W = "squad_output_weights" SQD_B = "squad_output_bias" - class BertConfig: def __init__(self, bert_config_path): with open(bert_config_path, 'r') as f: @@ -101,8 +100,8 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask) """ Add the attention layer """ - assert(len(input_tensor.shape) == 4) - S, hidden_size, _, _ = input_tensor.shape + assert(len(input_tensor.shape) == 5) + B, S, hidden_size, _, _ = input_tensor.shape num_heads = config.num_attention_heads head_size = int(hidden_size / num_heads) @@ -112,14 +111,13 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask) mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball) set_layer_name(mult_all, prefix, "qkv_mult") - has_mask = imask != None + has_mask = imask is not None pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32) - pf_S = trt.PluginField("S", np.array([S], np.int32), trt.PluginFieldType.INT32) pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32) - pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_S, pf_has_mask]) + pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_has_mask]) qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc) qkv_in = [mult_all.get_output(0), imask] @@ -133,8 +131,8 @@ def skipln(prefix, init_dict, network, input_tensor, skip): Add the skip layer """ idims = input_tensor.shape - assert len(idims) == 4 - hidden_size = idims[1] + assert len(idims) == 5 + hidden_size = idims[2] pf_ld = 
trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) wbeta = init_dict[prefix + "beta"] @@ -155,8 +153,8 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas Add the transformer layer """ idims = input_tensor.shape - assert len(idims) == 4 - hidden_size = idims[1] + assert len(idims) == 5 + hidden_size = idims[2] context_transposed = attention_layer_opt(prefix + "attention_self_", config, init_dict, network, input_tensor, imask) attention_heads = context_transposed.get_output(0) @@ -213,8 +211,8 @@ def squad_output(prefix, config, init_dict, network, input_tensor): """ idims = input_tensor.shape - assert len(idims) == 4 - S, hidden_size, _, _ = idims + assert len(idims) == 5 + B, S, hidden_size, _, _ = idims W_out = init_dict[prefix + SQD_W] B_out = init_dict[prefix + SQD_B] @@ -238,7 +236,7 @@ def load_weights(inputbase): # There might be training-related variables in the checkpoint that can be discarded param_names = [key for key in sorted(tensor_dict) if 'adam' not in key and 'global_step' not in key and 'pooler' not in key] count = len(param_names) - TRT_LOGGER.log(TRT_LOGGER.INFO, str(count)) + TRT_LOGGER.log(TRT_LOGGER.INFO, "Found {:} entries in weight map".format(count)) for pn in param_names: toks = pn.lower().split('/') @@ -252,7 +250,7 @@ def load_weights(inputbase): tensor = reader.get_tensor(pn) shape = tensor.shape if pn.find('kernel') != -1: - TRT_LOGGER.log(TRT_LOGGER.INFO, "Transposing {}\n".format(np)) + TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Transposing {}\n".format(np)) tensor = np.transpose(tensor) shape = tensor.shape @@ -260,7 +258,7 @@ def load_weights(inputbase): shape_str = '{} '.format(len(shape)) + ' '.join([str(d) for d in shape]) weights_dict[outname] = trt.Weights(flat_tensor) - TRT_LOGGER.log(TRT_LOGGER.INFO, "Orig.name: {:}, TRT name: {:}, shape: {:}".format(pn, outname, shape_str)) + TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Orig.name: {:}, TRT name: {:}, shape: {:}".format(pn, outname, shape_str)) additional_dict = dict() for key, value in weights_dict.items(): @@ -300,18 +298,13 @@ def load_weights(inputbase): def main(inputbase, B, S, bert_path, outputbase): bert_config_path = os.path.join(bert_path, 'bert_config.json') - TRT_LOGGER.log(TRT_LOGGER.INFO, bert_config_path) + TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) config = BertConfig(bert_config_path) # Load weights from checkpoint file init_dict = load_weights(inputbase) with trt.Builder(TRT_LOGGER) as builder: - builder.max_batch_size = B - builder.max_workspace_size = 5000 * (1024 * 1024) - builder.fp16_mode = True - builder.strict_type_constraints = False - ty = trt.PluginFieldType.FLOAT32 w = init_dict["bert_embeddings_layernorm_beta"] @@ -332,10 +325,36 @@ def main(inputbase, B, S, bert_path, outputbase): pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb]) fn = emln_plg_creator.create_plugin("embeddings", pfc) - with builder.create_network() as network: - input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(S, )) - segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(S, )) - input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(S, )) + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config: + builder_config.max_workspace_size = 5000 * (1024 * 1024) # 5000 MiB + 
builder_config.set_flag(trt.BuilderFlag.FP16) + + input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1, S)) + segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1, S)) + input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1, S)) + + def set_profile_shape(profile, batch_size): + shape = (batch_size, S) + profile.set_shape("input_ids", min=shape, opt=shape, max=shape) + profile.set_shape("segment_ids", min=shape, opt=shape, max=shape) + profile.set_shape("input_mask", min=shape, opt=shape, max=shape) + + # Specify profiles for the batch sizes we're interested in. + # For maximum performance, we will tie each profile to exactly one shape rather than a range. + bs1_profile = builder.create_optimization_profile() + set_profile_shape(bs1_profile, 1) + builder_config.add_optimization_profile(bs1_profile) + + bs_user_profile = builder.create_optimization_profile() + set_profile_shape(bs_user_profile, B) + builder_config.add_optimization_profile(bs_user_profile) + + bs8_profile = builder.create_optimization_profile() + set_profile_shape(bs8_profile, 8) + builder_config.add_optimization_profile(bs8_profile) + + # Create the network inputs = [input_ids, segment_ids, input_mask] emb_layer = network.add_plugin_v2(inputs, fn) @@ -346,25 +365,26 @@ def main(inputbase, B, S, bert_path, outputbase): squad_logits = squad_output("cls_", config, init_dict, network, bert_out) squad_logits_out = squad_logits.get_output(0) + network.mark_output(squad_logits_out) - engine = builder.build_cuda_engine(network) - TRT_LOGGER.log(TRT_LOGGER.INFO, "Serializing the engine....") - serialized_engine = engine.serialize() - TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving the engine....") - with open(outputbase, 'wb') as fout: - fout.write(serialized_engine) - TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") + with builder.build_engine(network, builder_config) as engine: + TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...") + serialized_engine = engine.serialize() + TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(outputbase)) + with open(outputbase, 'wb') as fout: + fout.write(serialized_engine) + TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") if __name__ == "__main__": - parser = argparse.ArgumentParser(description='TensorRT BERT Sample') + parser = argparse.ArgumentParser(description='TensorRT BERT Sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-m', '--model', required=True, - help='The checkpoint file basename, example basename(model.ckpt-766908.data-00000-of-00001)-> model.ckpt-766908') - parser.add_argument('-o', '--output', required=True, help='The bert engine file, ex bert.engine') - parser.add_argument('-b', '--batchsize', required=False, default=1, help='Batch size (default=1)') - parser.add_argument('-s', '--sequence', required=False, default=384, help='Sequence length of the BERT model (default=384)') + help='The checkpoint file basename, e.g.: basename(model.ckpt-766908.data-00000-of-00001) is model.ckpt-766908') + parser.add_argument('-o', '--output', required=True, default="bert_base_384.engine", help='The bert engine file, ex bert.engine') + parser.add_argument('-b', '--batchsize', default=1, help='Batch size') + parser.add_argument('-s', '--sequence', default=384, help='Sequence length of the BERT model') parser.add_argument('-c', '--config', required=True, help='The folder containing the bert_config.json, which can be downloaded e.g. 
from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google') @@ -376,5 +396,3 @@ def main(inputbase, B, S, bert_path, outputbase): S = int(opt.sequence) bert_path = opt.config main(inputbase, B, S, bert_path, outputbase) - # Required to work around a double free issue in TRT 5.1 - os._exit(0) diff --git a/demo/BERT/python/bert_inference.py b/demo/BERT/python/bert_inference.py index 4e3e8e25..3ac8a5a8 100644 --- a/demo/BERT/python/bert_inference.py +++ b/demo/BERT/python/bert_inference.py @@ -23,7 +23,7 @@ import pycuda.driver as cuda import pycuda.autoinit -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) +TRT_LOGGER = trt.Logger(trt.Logger.INFO) def parse_args(): """ @@ -32,24 +32,23 @@ def parse_args(): parser = argparse.ArgumentParser(description='BERT QA Inference') parser.add_argument('-e', '--bert_engine', dest='bert_engine', help='Path to BERT TensorRT engine') - parser.add_argument('-p', '--passage', nargs='*', dest='passage', + parser.add_argument('-p', '--passage', nargs='*', help='Text for paragraph/passage for BERT QA', default='') - parser.add_argument('-pf', '--passage_file', dest='passage_file', + parser.add_argument('-pf', '--passage-file', help='File containing input passage', default='') - parser.add_argument('-q', '--question', nargs='*', dest='question', + parser.add_argument('-q', '--question', nargs='*', help='Text for query/question for BERT QA', default='') - parser.add_argument('-qf', '--question_file', dest='question_file', + parser.add_argument('-qf', '--question-file', help='File containiner input question', default='') - parser.add_argument('-v', '--vocab_file', dest='vocab_file', + parser.add_argument('-v', '--vocab-file', help='Path to file containing entire understandable vocab', default='./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt') - parser.add_argument('-b', '--batch_size', dest='batch_size', - help='Batch size for inference', default=1, type=int) - return parser.parse_args() + args, _ = parser.parse_known_args() + return args if __name__ == '__main__': args = parse_args() @@ -87,35 +86,44 @@ def question_features(question): return dp.convert_examples_to_features(doc_tokens, question, tokenizer, max_seq_length, doc_stride, max_query_length) # Import necessary plugins for BERT TensorRT - nvinfer = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL) - cm = ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libcommon.so", mode=ctypes.RTLD_GLOBAL) - pg = ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL) + ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL) + ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libcommon.so", mode=ctypes.RTLD_GLOBAL) + ctypes.CDLL("/workspace/TensorRT/demo/BERT/build/libbert_plugins.so", mode=ctypes.RTLD_GLOBAL) + # The first context created will use the 0th profile. A new context must be created + # for each additional profile needed. Here, we only use batch size 1, thus we only need the first profile. with open(args.bert_engine, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime, \ runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context: - def binding_nbytes(binding): - return trt.volume(engine.get_binding_shape(binding)) * engine.get_binding_dtype(binding).itemsize - - # Allocate device memory for inputs and outputs. 
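The hunk around this point replaces the old fixed-size binding allocation with TensorRT 6's explicit-batch flow: input shapes are set on the execution context first, the output buffer is sized by querying the context, and `execute_async_v2` is used instead of `execute_async`. Because the flattened diff is hard to follow, here is a consolidated sketch of the new inference path under the same assumptions as the script (three int32 inputs at bindings 0-2, a float32 output at binding 3, batch size 1, and a `features` dict produced by the preprocessing step):

```python
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
max_seq_length = 384  # must lie within the engine's optimization profile

with open("bert_base_384.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, \
        runtime.deserialize_cuda_engine(f.read()) as engine, \
        engine.create_execution_context() as context:
    input_shape = (1, max_seq_length)
    input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
    d_inputs = [cuda.mem_alloc(input_nbytes) for _ in range(3)]

    # With an explicit-batch engine, the binding shapes must be specified
    # before the output size can be queried from the context.
    for binding in range(3):
        context.set_binding_shape(binding, input_shape)
    assert context.all_binding_shapes_specified

    h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()

    # features is assumed to be the dict returned by the preprocessing step.
    for i, name in enumerate(("input_ids", "segment_ids", "input_mask")):
        cuda.memcpy_htod_async(d_inputs[i], features[name], stream)

    # execute_async_v2 takes no batch-size argument; the batch dimension is
    # already part of the binding shapes set above.
    context.execute_async_v2(bindings=[int(d) for d in d_inputs] + [int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()
```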
- d_inputs = [cuda.mem_alloc(binding_nbytes(binding)) for binding in engine if engine.binding_is_input(binding)] - h_output = cuda.pagelocked_empty(tuple(engine.get_binding_shape(3)), dtype=np.float32) - d_output = cuda.mem_alloc(h_output.nbytes) + # We always use batch size 1. + input_shape = (1, max_seq_length) + input_nbytes = trt.volume(input_shape) * trt.int32.itemsize + # Allocate device memory for inputs. + d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)] # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() - def inference(input_features): + # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case) + # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape. + for binding in range(3): + context.set_binding_shape(binding, input_shape) + assert context.all_binding_shapes_specified + + # Allocate output buffer by querying the size from the context. This may be different for different input shapes. + h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32) + d_output = cuda.mem_alloc(h_output.nbytes) + + def inference(features): print("\nRunning Inference...") eval_start_time = time.time() # Copy inputs - cuda.memcpy_htod_async(d_inputs[0], input_features["input_ids"], stream) - cuda.memcpy_htod_async(d_inputs[1], input_features["segment_ids"], stream) - cuda.memcpy_htod_async(d_inputs[2], input_features["input_mask"], stream) - + cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream) + cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream) + cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream) # Run inference - context.execute_async(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) + context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) # Transfer predictions back from GPU cuda.memcpy_dtoh_async(h_output, d_output, stream) # Synchronize the stream @@ -123,26 +131,28 @@ def inference(input_features): eval_time_elapsed = time.time() - eval_start_time - # Data Post-processing - start_logits = h_output[:, 0] - end_logits = h_output[:, 1] + print("------------------------") + print("Running inference in {:.3f} Sentences/Sec".format(1.0/eval_time_elapsed)) + print("------------------------") - # Total number of n-best predictions to generate in the nbest_predictions.json output file - n_best_size = 20 + for index, batch in enumerate(h_output): + # Data Post-processing + start_logits = batch[:, 0] + end_logits = batch[:, 1] - # The maximum length of an answer that can be generated. This is needed - # because the start and end predictions are not conditioned on one another - max_answer_length = 30 + # Total number of n-best predictions to generate in the nbest_predictions.json output file + n_best_size = 20 - prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, input_features, - start_logits, end_logits, n_best_size, max_answer_length) + # The maximum length of an answer that can be generated. 
This is needed + # because the start and end predictions are not conditioned on one another + max_answer_length = 30 - print("------------------------") - print("Running inference in {:.3f} Sentences/Sec".format(1.0/eval_time_elapsed)) - print("------------------------") + prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features, + start_logits, end_logits, n_best_size, max_answer_length) - print("Answer: '{}'".format(prediction)) - print("With probability: {:.3f}".format(nbest_json[0]['probability']*100.0)) + print("Processing output {:} in batch".format(index)) + print("Answer: '{}'".format(prediction)) + print("With probability: {:.3f}".format(nbest_json[0]['probability'] * 100.0)) if question_text: print("\nQuestion: {}".format(question_text)) diff --git a/demo/BERT/python/create_docker_container.sh b/demo/BERT/python/create_docker_container.sh index 58f941aa..15f4fad3 100755 --- a/demo/BERT/python/create_docker_container.sh +++ b/demo/BERT/python/create_docker_container.sh @@ -32,6 +32,8 @@ docker run -it --rm \ --shm-size=1g \ --ulimit memlock=1 \ --ulimit stack=67108864 \ + --publish 0.0.0.0:8888:8888 \ + --ip 0.0.0.0 \ -u $(id -u):$(id -g) \ -v ${HOME}/models:/models \ -v ${TENSORRT_DIR}:/workspace/TensorRT \ diff --git a/demo/BERT/sampleBERT.cpp b/demo/BERT/sampleBERT.cpp index 5504f112..06e9c606 100644 --- a/demo/BERT/sampleBERT.cpp +++ b/demo/BERT/sampleBERT.cpp @@ -33,6 +33,7 @@ #include #include +#include "bert.h" #include "bertUtils.h" #include "dataUtils.h" @@ -42,198 +43,34 @@ #include "squad.h" using namespace bert; +using namespace nvinfer1; Args gArgs; -const std::string gSampleName = "TensorRT.sample_bert"; -const std::string TEST_INPUT_NAME = "test_inputs.weights_int32"; -const std::string TEST_OUTPUT_NAME = "test_outputs.weights"; -const std::string BERT_WEIGHTS_NAME = "bert.weights"; -const int NUM_RUNS = 10; - -void doInference(IExecutionContext& context, const std::map& inCfg, - std::map>& outCfg, const int batchSize, cudaStream_t stream, - std::vector& timesTotal, std::vector& timesCompute, int verbose = 1) -{ - - const int numRuns = timesTotal.size(); - assert(numRuns == timesCompute.size()); - assert(numRuns > 0); - - const ICudaEngine& engine = context.getEngine(); - const int numBindings = engine.getNbBindings(); - assert(numBindings == inCfg.size() + outCfg.size()); - std::vector buffers(numBindings); - allocBindingsFromWeights(engine, buffers, batchSize, inCfg, verbose); - allocBindingsFromVectors(engine, buffers, batchSize, outCfg, verbose); - - void** bs = buffers.data(); - - std::vector startsTotal(numRuns); - std::vector stopsTotal(numRuns); - std::vector startsCompute(numRuns); - std::vector stopsCompute(numRuns); - - for (int it = 0; it < numRuns; it++) - { - cudaEventCreate(&startsTotal[it]); - cudaEventCreate(&stopsTotal[it]); - - cudaEventCreate(&startsCompute[it]); - cudaEventCreate(&stopsCompute[it]); - } - - cudaProfilerStart(); - for (int it = 0; it < numRuns; it++) - { - CHECK(cudaEventRecord(startsTotal[it], stream)); - copyToDeviceBindings(engine, buffers, batchSize, inCfg, stream); - CHECK(cudaEventRecord(startsCompute[it], stream)); - context.enqueue(batchSize, bs, stream, nullptr); - CHECK(cudaEventRecord(stopsCompute[it], stream)); - copyFromDeviceBindings(engine, buffers, batchSize, outCfg, stream); - CHECK(cudaEventRecord(stopsTotal[it], stream)); - } - CHECK(cudaDeviceSynchronize()); - - cudaProfilerStop(); - float milliseconds = 0; - for (int it = 0; it < numRuns; it++) - { - 
cudaEventElapsedTime(&milliseconds, startsTotal[it], stopsTotal[it]); - timesTotal[it] = milliseconds; - cudaEventElapsedTime(&milliseconds, startsCompute[it], stopsCompute[it]); - timesCompute[it] = milliseconds; - - cudaEventDestroy(startsTotal[it]); - cudaEventDestroy(stopsTotal[it]); - cudaEventDestroy(startsCompute[it]); - cudaEventDestroy(stopsCompute[it]); - - printf("Run %d; Total: %fms Comp.only: %fms\n", it, timesTotal[it], timesCompute[it]); - } - - cudaProfilerStop(); - - for (auto& devptr : buffers) - { - CHECK(cudaFree(devptr)); - } -} - -// Create the Engine using only the API and not any parser. -nvinfer1::ICudaEngine* fromAPIToModel(nvinfer1::IBuilder* builder, const int numHeads, const int B, const int S) -{ - - builder->setMaxBatchSize(B); - builder->setMaxWorkspaceSize(5000_MB); - builder->setFp16Mode(gArgs.runInFp16); - if (gArgs.runInFp16) - { - gLogInfo << ("Running in FP 16 Mode\n"); - builder->setStrictTypeConstraints(true); - } - - nvinfer1::INetworkDefinition* network = builder->createNetwork(); - - WeightMap weightMap; - - const std::string weightsPath(locateFile(BERT_WEIGHTS_NAME, gArgs.dataDirs)); - - loadWeights(weightsPath, weightMap); - - // infer these from the parameters - int intermediateSize = 0; - int numHiddenLayers = 0; - int hiddenSize = 0; - - inferNetworkSizes(weightMap, hiddenSize, intermediateSize, numHiddenLayers); - - assert(intermediateSize); - assert(hiddenSize); - assert(numHiddenLayers); - - /// Embeddings Layer - - ITensor* inputIds = network->addInput("input_ids", DataType::kINT32, Dims{1, S}); - - ITensor* segmentIds = network->addInput("segment_ids", DataType::kINT32, Dims{1, S}); - - ITensor* inputMask = network->addInput("input_mask", DataType::kINT32, Dims{1, S}); - - const Weights& wBeta = weightMap.at("bert_embeddings_layernorm_beta"); - const Weights& wGamma = weightMap.at("bert_embeddings_layernorm_gamma"); - const Weights& wWordEmb = weightMap.at("bert_embeddings_word_embeddings"); - const Weights& wTokEmb = weightMap.at("bert_embeddings_token_type_embeddings"); - const Weights& wPosEmb = weightMap.at("bert_embeddings_position_embeddings"); - ITensor* inputs[3] = {inputIds, segmentIds, inputMask}; - - auto embPlugin = EmbLayerNormPlugin("embeddings", gArgs.runInFp16, wBeta, wGamma, wWordEmb, wPosEmb, wTokEmb); - IPluginV2Layer* embLayer = network->addPluginV2(inputs, 3, embPlugin); - setOutputName(embLayer, "embeddings_", "output"); - - ITensor* embeddings = embLayer->getOutput(0); - ITensor* maskIdx = embLayer->getOutput(1); - - /// BERT Encoder - - const BertConfig config(numHeads, hiddenSize, intermediateSize, numHiddenLayers, gArgs.runInFp16); - - ILayer* bertLayer = bertModel(config, weightMap, network, embeddings, maskIdx); - - /// SQuAD Output Layer - - ILayer* squadLayer = squad("cls_", config, weightMap, network, bertLayer->getOutput(0)); - - network->markOutput(*squadLayer->getOutput(0)); - - // Build the engine - - auto engine = builder->buildCudaEngine(*network); - // we don't need the network any more - network->destroy(); - - // Once we have built the cuda engine, we can release all of our held memory. 
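The builder code being deleted in this hunk relied on the implicit-batch API (`setMaxBatchSize`, `buildCudaEngine`); the replacement driver, like `bert_builder.py` earlier in this change, builds an explicit-batch network and pins the input shapes with optimization profiles. A skeleton of that pattern in Python is sketched below; the input names and the single batch-size-1 profile are illustrative (the real builder script registers several profiles), and the network body is elided.

```python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
S = 384  # sequence length used by this demo

with trt.Builder(TRT_LOGGER) as builder:
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with builder.create_network(explicit_batch) as network, \
            builder.create_builder_config() as config:
        config.max_workspace_size = 5000 * (1024 * 1024)  # 5000 MiB
        config.set_flag(trt.BuilderFlag.FP16)

        # The batch dimension is dynamic (-1); profiles constrain it at build time.
        input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1, S))
        segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1, S))
        input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1, S))

        profile = builder.create_optimization_profile()
        for name in ("input_ids", "segment_ids", "input_mask"):
            profile.set_shape(name, min=(1, S), opt=(1, S), max=(1, S))
        config.add_optimization_profile(profile)

        # ... add the embedding, transformer, and SQuAD output layers here,
        # then mark the logits tensor as a network output ...

        # build_engine replaces build_cuda_engine and takes the config that
        # carries the FP16 flag, workspace size, and optimization profiles.
        engine = builder.build_engine(network, config)
```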
- for (auto& w : weightMap) - free(const_cast(w.second.values)); - return engine; -} - -nvinfer1::ICudaEngine* APIToModel(const int numHeads, const int B, const int S) -{ - // create the builder - nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger.getTRTLogger()); - assert(builder != nullptr); - - // create the model to populate the network, then set the outputs and create an engine - nvinfer1::ICudaEngine* engine = fromAPIToModel(builder, numHeads, B, S); - - assert(engine != nullptr); - - builder->destroy(); - return engine; -} +constexpr const char* gSampleName = "TensorRT.sample_bert"; +constexpr const char* kTEST_INPUT_FNAME = "test_inputs.weights_int32"; +constexpr const char* kTEST_OUTPUT_FNAME = "test_outputs.weights"; +constexpr const char* kBERT_WEIGHTS_FNAME = "bert.weights"; +constexpr int kNUM_RUNS = 10; //! //! \brief This function prints the help information for running this sample //! void printHelpInfo() { - std::cout << "Usage: ./sample_bert [-h or --help] [-d or --datadir=] [--fp1 ]\n"; + std::cout << "Usage: ./sample_bert --nheads= [-h or --help] [-d or --datadir=] [--fp16] [--saveEngine=]\n"; std::cout << "--help Display help information\n"; std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " - "multiple times to add multiple directories. The given path(s) must contain the weights and test " - "inputs/outputs." - << std::endl; - std::cout << "--nheads Number of attention heads." << std::endl; - std::cout << "--fp16 OPTIONAL: Run in FP16 mode." << std::endl; - std::cout << "--saveEngine The path at which to write a serialized engine." << std::endl; + "multiple times to add multiple directories. The given path(s) must contain the weights and test " + "inputs/outputs.\n"; + std::cout << "--nheads Number of attention heads.\n"; + std::cout << "--fp16 Run in FP16 mode.\n"; + std::cout << "--saveEngine The path at which to write a serialized engine." 
<< endl; } int main(int argc, char* argv[]) { - - bool argsOK = parseArgs(gArgs, argc, argv); + const bool argsOK = parseArgs(gArgs, argc, argv); if (gArgs.help) { printHelpInfo(); @@ -241,123 +78,130 @@ int main(int argc, char* argv[]) } if (!argsOK) { - gLogError << "Invalid arguments" << std::endl; + gLogError << "Invalid arguments" << endl; printHelpInfo(); return EXIT_FAILURE; } if (gArgs.dataDirs.empty()) { - gLogError << "No datadirs given" << std::endl; + gLogError << "No datadirs given" << endl; printHelpInfo(); return EXIT_FAILURE; } if (gArgs.numHeads <= 0) { - gLogError << "invalid number of heads" << std::endl; + gLogError << "invalid number of heads" << endl; printHelpInfo(); return EXIT_FAILURE; } - const std::string weightsPath(locateFile(TEST_OUTPUT_NAME, gArgs.dataDirs)); - std::map testOutputs; - loadWeights(weightsPath, testOutputs); - std::vector inputIds; - std::vector inputMasks; - std::vector segmentIds; - std::vector inputDims; - std::string inputPath(locateFile(TEST_INPUT_NAME, gArgs.dataDirs)); + gLogger.setReportableSeverity(Logger::Severity::kINFO); + auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast(argv)); + gLogger.reportTestStart(sampleTest); + + // Load weights and golden files + const std::string outputPath(locateFile(kTEST_OUTPUT_FNAME, gArgs.dataDirs)); + WeightMap testOutputs; + loadWeights(outputPath, testOutputs); + + vector inputIds; + vector inputMasks; + vector segmentIds; + vector inputDims; + const std::string inputPath(locateFile(kTEST_INPUT_FNAME, gArgs.dataDirs)); int S = 0; int Bmax = 0; loadInputs(inputPath, Bmax, S, inputIds, inputMasks, segmentIds, inputDims); - assert(inputIds.size() > 0); + assert(inputIds.size() > 0 && "No inputs found in supplied golden file"); - auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast(argv)); + // Create optimization profiles. In this case, we only create a single profile for the shape we care about. + const int numHeads = gArgs.numHeads; - gLogger.reportTestStart(sampleTest); + const auto profile = std::make_tuple(Dims{2, Bmax, S}, Dims{2, Bmax, S}, Dims{2, Bmax, S}); + OptProfileMap optProfileMap = {std::make_pair(kMODEL_INPUT0_NAME, profile), + std::make_pair(kMODEL_INPUT1_NAME, profile), std::make_pair(kMODEL_INPUT2_NAME, profile)}; - const int numHeads = gArgs.numHeads; - nvinfer1::ICudaEngine* engine = APIToModel(numHeads, Bmax, S); - if (engine == nullptr) - { - gLogError << "Unable to build engine." 
<< std::endl; - return gLogger.reportFail(sampleTest); - } + OptProfiles optProfiles = {optProfileMap}; - if (!gArgs.saveEngine.empty()) - { - std::ofstream engineFile(gArgs.saveEngine, std::ios::binary); - if (!engineFile) - { - gLogError << "Cannot open engine file: " << gArgs.saveEngine << std::endl; - return gLogger.reportFail(sampleTest); - } - - nvinfer1::IHostMemory* serializedEngine{engine->serialize()}; - if (serializedEngine == nullptr) - { - gLogError << "Engine serialization failed" << std::endl; - return false; - } - - engineFile.write(static_cast(serializedEngine->data()), serializedEngine->size()); - serializedEngine->destroy(); - } + // Prepare the TRT Network + BERTDriver bertDriver(numHeads, gArgs.runInFp16, 5000_MiB, optProfiles); + + const std::vector inputShape(inputDims[0].d, inputDims[0].d + 2); + const HostTensorMap inCfg{ + std::make_pair(kMODEL_INPUT0_NAME, + make_shared(const_cast(inputIds[0].values), inputIds[0].type, inputShape)), + std::make_pair(kMODEL_INPUT1_NAME, + make_shared(const_cast(segmentIds[0].values), segmentIds[0].type, inputShape)), + std::make_pair(kMODEL_INPUT2_NAME, + make_shared(const_cast(inputMasks[0].values), inputMasks[0].type, inputShape))}; + + const int B = inputDims[0].d[0]; + + WeightMap weightMap; - nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger); - if (runtime == nullptr) + const std::string weightsPath(locateFile(kBERT_WEIGHTS_FNAME, gArgs.dataDirs)); + loadWeights(weightsPath, weightMap); + + HostTensorMap params; + for (auto& kv : weightMap) { - gLogError << "Unable to create runtime." << std::endl; - return gLogger.reportFail(sampleTest); + std::vector shape{static_cast(kv.second.count)}; + params[kv.first] = make_shared(const_cast(kv.second.values), kv.second.type, shape); } - nvinfer1::IExecutionContext* context = engine->createExecutionContext(); - if (context == nullptr) + // Build the TRT Engine + bertDriver.init(params); + + if (!gArgs.saveEngine.empty()) { - gLogError << "Unable to create context." 
<< std::endl; - return gLogger.reportFail(sampleTest); + bertDriver.serializeEngine(gArgs.saveEngine); } - const std::map inCfg{std::make_pair("input_ids", inputIds[0]), - std::make_pair("input_mask", inputMasks[0]), std::make_pair("segment_ids", segmentIds[0])}; - const int B = inputDims[0].d[0]; - + // Benchmark inference const std::string outputName("cls_squad_logits"); - std::map> outCfg = {make_pair(outputName, std::vector(2 * B * S))}; + std::vector output(2 * B * S); + HostTensorMap outCfg + = {make_pair(outputName, make_shared(output.data(), DataType::kFLOAT, std::vector{2, static_cast(B), static_cast(S)}))}; cudaStream_t stream; cudaStreamCreate(&stream); - std::vector timesTotal(NUM_RUNS); // total time - std::vector timesCompute(NUM_RUNS); // computation time + std::vector timesTotal(kNUM_RUNS); // Total time + std::vector timesCompute(kNUM_RUNS); // Computation time - doInference(*context, inCfg, outCfg, B, stream, timesTotal, timesCompute); + bertDriver.benchmark(inCfg, outCfg, B, stream, timesTotal, timesCompute); cudaStreamDestroy(stream); - context->destroy(); - engine->destroy(); - runtime->destroy(); - auto& output = outCfg[outputName]; transposeLogits(output, B, S); const float* test = reinterpret_cast(testOutputs["logits"].values); - float mae = 0; + // Analyze benchmark results + float meanAbsErr = 0; float maxdiff = 0; for (int it = 0; it < testOutputs["logits"].count; it++) { - const float diff = std::abs(test[it] - output[it]); - mae += diff; - maxdiff = std::max(diff, maxdiff); + const float diff = abs(test[it] - output[it]); + meanAbsErr += diff; + maxdiff = max(diff, maxdiff); } - const float avgTotal - = std::accumulate(timesTotal.begin(), timesTotal.end(), 0.f, std::plus()) / timesTotal.size(); + meanAbsErr /= output.size(); + + const float avgTotal = accumulate(timesTotal.begin(), timesTotal.end(), 0.f, plus()) / timesTotal.size(); const float avgCompute - = std::accumulate(timesCompute.begin(), timesCompute.end(), 0.f, std::plus()) / timesCompute.size(); + = accumulate(timesCompute.begin(), timesCompute.end(), 0.f, plus()) / timesCompute.size(); - printf("B=%d S=%d MAE=%.12e MaxDiff=%.12e ", B, S, (mae) / output.size(), maxdiff); - printf(" Runtime(total avg)=%.6fms Runtime(comp ms)=%.6f\n", avgTotal, avgCompute); - // destroy the engine - bool pass{true}; + printf("B=%d S=%d MAE=%.12e MaxDiff=%.12e ", B, S, meanAbsErr, maxdiff); + printf(" Runtime(total avg)=%.6fms Runtime(comp ms)=%.6f\n", avgTotal, avgCompute); + bool pass{false}; + if (gArgs.runInFp16) + { + pass = meanAbsErr < 2e-2; + } + else + { + pass = meanAbsErr < 1e-5; + } return gLogger.reportTest(sampleTest, pass); } diff --git a/demo/BERT/util/bertUtils.h b/demo/BERT/util/bertUtils.h index 0dbde7d8..6820f357 100644 --- a/demo/BERT/util/bertUtils.h +++ b/demo/BERT/util/bertUtils.h @@ -20,12 +20,13 @@ #include "cuda_profiler_api.h" #include #include +#include +#include namespace bert { using WeightMap = std::map; -using TensorMap = std::map; struct BertConfig { @@ -50,14 +51,12 @@ struct BertConfig } }; -Weights noop{DataType::kFLOAT, nullptr, 0}; - -void setTensorName(ITensor* tensor, const std::string& prefix, const std::string& name) +inline void setTensorName(ITensor* tensor, const std::string& prefix, const std::string& name) { tensor->setName((prefix + name).c_str()); } -void setOutputName(ILayer* layer, const std::string& prefix, const std::string& name, int out_idx = 0) +inline void setOutputName(ILayer* layer, const std::string& prefix, const std::string& name, int out_idx = 0) { 
setTensorName(layer->getOutput(out_idx), prefix, name); } @@ -134,7 +133,7 @@ inline bool parseArgs(Args& args, int argc, char* argv[]) return true; } -bool operator==(const nvinfer1::Dims& d1, const nvinfer1::Dims& d2) +inline bool operator==(const nvinfer1::Dims& d1, const nvinfer1::Dims& d2) { if (d1.d == d2.d) { diff --git a/demo/BERT/util/dataUtils.cpp b/demo/BERT/util/dataUtils.cpp index 6dfd0c33..17142dbd 100644 --- a/demo/BERT/util/dataUtils.cpp +++ b/demo/BERT/util/dataUtils.cpp @@ -29,8 +29,6 @@ namespace bert using namespace nvinfer1; using namespace samplesCommon; -using std::cout; -using std::endl; void parseDims(std::ifstream& input, const std::string& name, Dims& d) { @@ -39,7 +37,6 @@ void parseDims(std::ifstream& input, const std::string& name, Dims& d) { input >> d.d[it]; } - cout << name << ": nbDim=" << d.nbDims << " dim: " << d << endl; assert(input.peek() == ' '); input.get(); } @@ -109,7 +106,7 @@ void loadWeights(const std::string& wts_path, WeightMap& weightMap) std::ifstream input(wts_path, std::ios_base::binary); int32_t count; input >> count; - cout << "Number of parameters: " << count << endl; + gLogInfo << "Number of parameters: " << count << std::endl; for (int it = 0; it < count; it++) { @@ -126,7 +123,7 @@ void loadWeights(const std::string& wts_path, WeightMap& weightMap) // output as squad_output_weights if (name.find("kernel") != std::string::npos) { - cout << "Transposing\n"; + gLogVerbose << "Transposing\n"; transposeMatrix(data, d); } Weights tmp; @@ -188,7 +185,7 @@ void loadInputs(const std::string& weightsPath, int& Bmax, int& S, std::vector> count; - cout << "Number of buffers: " << count << endl; + gLogInfo << "Number of buffers: " << count << std::endl; assert(count % 3 == 0); S = 0; Bmax = 0; @@ -284,80 +281,6 @@ void inferNetworkSizes(const WeightMap& weightMap, int& hiddenSize, int& interme } } -void allocBindingsFromWeights(const ICudaEngine& engine, std::vector& buffers, const int batchSize, - const std::map& dict, int verbose) -{ - - Weights W; - std::string name; - - for (auto& kv : dict) - { - std::tie(name, W) = kv; - const int idx = engine.getBindingIndex(name.c_str()); - if (verbose) - { - printf(" idx %d name %s\n", idx, name.c_str()); - } - assert(idx >= 0); - const int outlen = W.count * getElementSize(W.type); - CHECK(cudaMalloc(&buffers[idx], outlen)); - if (verbose) - { - printf(" idx %d allocated %d bytes\n", idx, outlen); - } - } -} - -void allocBindingsFromVectors(const ICudaEngine& engine, std::vector& buffers, const int batchSize, - const std::map>& dict, int verbose) -{ - - for (auto& kv : dict) - { - const int idx = engine.getBindingIndex(kv.first.c_str()); - if (verbose) - { - printf(" idx %d name %s\n", idx, kv.first.c_str()); - } - assert(idx >= 0); - const int outlen = sizeof(float) * kv.second.size(); - CHECK(cudaMalloc(&buffers[idx], outlen)); - if (verbose) - { - printf(" idx %d allocated %d bytes\n", idx, outlen); - } - } -} - -void copyToDeviceBindings(const ICudaEngine& engine, std::vector& buffers, const int batchSize, - const std::map& dict, cudaStream_t stream) -{ - - Weights W; - std::string name; - - for (auto kv : dict) - { - std::tie(name, W) = kv; - const int idx = engine.getBindingIndex(name.c_str()); - const int len = W.count * getElementSize(W.type); - CHECK(cudaMemcpyAsync(buffers[idx], W.values, len, cudaMemcpyHostToDevice, stream)); - } -} - -void copyFromDeviceBindings(const ICudaEngine& engine, std::vector& buffers, const int batchSize, - std::map>& dict, cudaStream_t stream) -{ - for (auto& kv : 
dict) - { - const int idx = engine.getBindingIndex(kv.first.c_str()); - const int len = kv.second.size() * sizeof(float); - CHECK(cudaMemcpyAsync(&kv.second[0], buffers[idx], len, cudaMemcpyDeviceToHost, stream)); - printf("Binding %s idx %d downloading %d bytes\n", kv.first.c_str(), idx, len); - } -} - void transposeLogits(std::vector& logits, const int B, const int S) { // BxSx2 => 2xBxS diff --git a/demo/BERT/util/dataUtils.h b/demo/BERT/util/dataUtils.h index 6fa02d21..45a708f5 100644 --- a/demo/BERT/util/dataUtils.h +++ b/demo/BERT/util/dataUtils.h @@ -28,7 +28,7 @@ namespace bert //! \brief Loads a dictionary of weights //! \details The function loads the weights of the BERT network from a weights file. The Weights in the dictionary own -//! the storage behind the Weights::values pointer. It is therefore the callers responsibility to free it. See also helpers/convert_weights.py +//! the storage behind the Weights::values pointer. It is therefore the callers responsibility to free it. See also helpers/convert_weights.py //!\param path path to inputs //!\param weightMap map of weights that the function will populate void loadWeights(const std::string& path, WeightMap& weightMap); @@ -57,18 +57,6 @@ void loadInputs(const std::string& path, int& Bmax, int& S, std::vector& buffers, const int batchSize, - const std::map& dict, int verbose); - -void allocBindingsFromVectors(const nvinfer1::ICudaEngine& engine, std::vector& buffers, const int batchSize, - const std::map>& dict, int verbose); - -void copyToDeviceBindings(const nvinfer1::ICudaEngine& engine, std::vector& buffers, const int batchSize, - const std::map& dict, cudaStream_t stream); - -void copyFromDeviceBindings(const nvinfer1::ICudaEngine& engine, std::vector& buffers, const int batchSize, - std::map>& dict, cudaStream_t stream); - //! \brief Transposes logits from BxSx2 to 2xBxS //! \details Due to a limitation of TensorRT, the network itself cannot transpose the output of the squad logits //! into the desired shape. This function performs the transpose on the input in-place. diff --git a/docker/ubuntu-18.04-cross-aarch64.Dockerfile b/docker/ubuntu-18.04-cross-aarch64.Dockerfile new file mode 100644 index 00000000..608841de --- /dev/null +++ b/docker/ubuntu-18.04-cross-aarch64.Dockerfile @@ -0,0 +1,90 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#ARG CUDA_VERSION=10.1 +#FROM ubuntu:18.04 +FROM nvidia/cuda:10.0-devel-ubuntu18.04 + +LABEL maintainer="NVIDIA CORPORATION" + +# Install requried libraries +RUN apt-get update && apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + wget \ + zlib1g-dev \ + git \ + pkg-config \ + python3 \ + python3-pip + +RUN cd /usr/local/bin &&\ + ln -s /usr/bin/python3 python &&\ + ln -s /usr/bin/pip3 pip + +# Install Cmake +RUN cd /tmp &&\ + wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh &&\ + chmod +x cmake-3.14.4-Linux-x86_64.sh &&\ + ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license &&\ + rm ./cmake-3.14.4-Linux-x86_64.sh + + +COPY docker/jetpack_files /pdk_files +COPY scripts/stubify.sh /pdk_files + +# Install CUDA cross compile toolchain +RUN dpkg -i /pdk_files/cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb /pdk_files/cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb \ + && apt-get update \ + && apt-get install -y cuda-cross-aarch64 \ + && rm -rf /var/lib/apt/lists/* + +# Unpack cudnn +RUN dpkg -x /pdk_files/libcudnn7_7.5.0.56-1+cuda10.0_arm64.deb /pdk_files/cudnn \ + && dpkg -x /pdk_files/libcudnn7-dev_7.5.0.56-1+cuda10.0_arm64.deb /pdk_files/cudnn \ + && cd /pdk_files/cudnn/usr/include/aarch64-linux-gnu \ + && cd /pdk_files/cudnn/usr/lib/aarch64-linux-gnu \ + && ln -s libcudnn.so.7 libcudnn.so \ + && cd /pdk_files/cudnn \ + && ln -s usr/include/aarch64-linux-gnu include \ + && ln -s usr/lib/aarch64-linux-gnu lib \ + && ln -s /pdk_files/cudnn/usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/cudnn.h + +# Unpack libnvinfer +# +RUN dpkg -x /pdk_files/libnvinfer6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvinfer-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvparsers6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvparsers-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvinfer-plugin6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvinfer-plugin-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvonnxparsers6_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt \ + && dpkg -x /pdk_files/libnvonnxparsers-dev_6.0.1-1+cuda10.0_arm64.deb /pdk_files/tensorrt + +# create stub libraries +RUN cd /pdk_files/tensorrt \ + && ln -s usr/include/aarch64-linux-gnu include \ + && ln -s usr/lib/aarch64-linux-gnu lib \ + && cd lib \ + && mkdir stubs \ + && for x in nvinfer nvparsers nvinfer_plugin nvonnxparser; \ + do \ + CC=aarch64-linux-gnu-gcc /pdk_files/stubify.sh lib${x}.so stubs/lib${x}.so \ + ; done + +# Set environment and working directory +ENV TRT_RELEASE /pdk_files/tensorrt +ENV TRT_SOURCE /workspace/TensorRT +WORKDIR /workspace + +RUN ["/bin/bash"] diff --git a/include/NvCaffeParser.h b/include/NvCaffeParser.h index 0ff49511..eaecf1d6 100644 --- a/include/NvCaffeParser.h +++ b/include/NvCaffeParser.h @@ -19,11 +19,11 @@ #include "NvInfer.h" -namespace ditcaffe -{ -class NetParameter; -} - +//! +//! \namespace nvcaffeparser1 +//! +//! \brief The TensorRT Caffe parser API namespace. +//! namespace nvcaffeparser1 { @@ -47,7 +47,7 @@ class IBlobNameToTensor //! //! \return ITensor* corresponding to the queried name. If no such ITensor exists, then nullptr is returned. //! 
- virtual nvinfer1::ITensor* find(const char* name) const = 0; + virtual nvinfer1::ITensor* find(const char* name) const TRTNOEXCEPT = 0; protected: virtual ~IBlobNameToTensor() {} @@ -65,10 +65,10 @@ class IBlobNameToTensor class IBinaryProtoBlob { public: - virtual const void* getData() = 0; - virtual nvinfer1::DimsNCHW getDimensions() = 0; - virtual nvinfer1::DataType getDataType() = 0; - virtual void destroy() = 0; + virtual const void* getData() TRTNOEXCEPT = 0; + virtual nvinfer1::DimsNCHW getDimensions() TRTNOEXCEPT = 0; + virtual nvinfer1::DataType getDataType() TRTNOEXCEPT = 0; + virtual void destroy() TRTNOEXCEPT = 0; protected: virtual ~IBinaryProtoBlob() {} @@ -87,7 +87,7 @@ class IPluginFactory //! //! \param layerName Name of the layer which the user wishes to validate. //! - virtual bool isPlugin(const char* layerName) = 0; + virtual bool isPlugin(const char* layerName) TRTNOEXCEPT = 0; //! //! \brief Creates a plugin. @@ -96,7 +96,7 @@ class IPluginFactory //! \param weights Weights used for the layer. //! \param nbWeights Number of weights. //! - virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) = 0; + virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) TRTNOEXCEPT = 0; }; //! @@ -107,7 +107,7 @@ class IPluginFactory class IPluginFactoryExt : public IPluginFactory { public: - virtual int getVersion() const + virtual int getVersion() const TRTNOEXCEPT { return NV_TENSORRT_VERSION; } @@ -117,7 +117,7 @@ class IPluginFactoryExt : public IPluginFactory //! //! \param layerName Name of the layer which the user wishes to validate. //! - virtual bool isPluginExt(const char* layerName) = 0; + virtual bool isPluginExt(const char* layerName) TRTNOEXCEPT = 0; }; //! @@ -133,7 +133,7 @@ class IPluginFactoryV2 //! //! \param layerName Name of the layer which the user wishes to validate. //! - virtual bool isPluginV2(const char* layerName) = 0; + virtual bool isPluginV2(const char* layerName) TRTNOEXCEPT = 0; //! //! \brief Creates a plugin. @@ -143,7 +143,7 @@ class IPluginFactoryV2 //! \param nbWeights Number of weights. //! \param libNamespace Library Namespace associated with the plugin object //! - virtual nvinfer1::IPluginV2* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights, const char* libNamespace = "") = 0; + virtual nvinfer1::IPluginV2* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights, const char* libNamespace = "") TRTNOEXCEPT = 0; }; //! //! \class ICaffeParser @@ -159,9 +159,9 @@ class ICaffeParser public: //! //! \brief Parse a prototxt file and a binaryproto Caffe model to extract - //! network configuration and weights associated with the network, respectively. + //! network definition and weights associated with the network, respectively. //! - //! \param deploy The plain text, prototxt file used to define the network configuration. + //! \param deploy The plain text, prototxt file used to define the network definition. //! \param model The binaryproto Caffe model that contains the weights associated with the network. //! \param network Network in which the CaffeParser will fill the layers. //! \param weightType The type to which the weights will transformed. 
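For reference, a minimal sketch of how the ICaffeParser API documented in this header is typically driven; `gLogger`, the file names, and the output blob name `"prob"` are placeholders, not part of this change:

```cpp
#include "NvInfer.h"
#include "NvCaffeParser.h"

// gLogger: an nvinfer1::ILogger implementation (e.g. the logger used by the samples).
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); // implicit-batch network
nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();

const nvcaffeparser1::IBlobNameToTensor* blobs
    = parser->parse("deploy.prototxt", "model.caffemodel", *network, nvinfer1::DataType::kFLOAT);
if (blobs != nullptr)
{
    network->markOutput(*blobs->find("prob")); // find() returns nullptr for unknown blob names
}
```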
@@ -173,14 +173,13 @@ class ICaffeParser virtual const IBlobNameToTensor* parse(const char* deploy, const char* model, nvinfer1::INetworkDefinition& network, - nvinfer1::DataType weightType) - = 0; + nvinfer1::DataType weightType) TRTNOEXCEPT = 0; //! //! \brief Parse a deploy prototxt a binaryproto Caffe model from memory buffers to extract - //! network configuration and weights associated with the network, respectively. + //! network definition and weights associated with the network, respectively. //! - //! \param deployBuffer The plain text deploy prototxt used to define the network configuration. + //! \param deployBuffer The plain text deploy prototxt used to define the network definition. //! \param deployLength The length of the deploy buffer. //! \param modelBuffer The binaryproto Caffe memory buffer that contains the weights associated with the network. //! \param modelLength The length of the model buffer. @@ -196,7 +195,7 @@ class ICaffeParser const char* modelBuffer, std::size_t modelLength, nvinfer1::INetworkDefinition& network, - nvinfer1::DataType weightType) = 0; + nvinfer1::DataType weightType) TRTNOEXCEPT = 0; //! //! \brief Parse and extract data stored in binaryproto file. @@ -210,7 +209,7 @@ class ICaffeParser //! //! \see nvcaffeparser1::IBinaryProtoBlob //! - virtual IBinaryProtoBlob* parseBinaryProto(const char* fileName) = 0; + virtual IBinaryProtoBlob* parseBinaryProto(const char* fileName) TRTNOEXCEPT = 0; //! //! \brief Set buffer size for the parsing and storage of the learned model. @@ -219,41 +218,68 @@ class ICaffeParser //! //! \note Default size is 2^30 bytes. //! - virtual void setProtobufBufferSize(size_t size) = 0; + virtual void setProtobufBufferSize(size_t size) TRTNOEXCEPT = 0; //! //! \brief Set the IPluginFactory used to create the user defined plugins. //! //! \param factory Pointer to an instance of the user implmentation of IPluginFactory. //! - virtual void setPluginFactory(IPluginFactory* factory) = 0; + virtual void setPluginFactory(IPluginFactory* factory) TRTNOEXCEPT = 0; //! //! \brief Set the IPluginFactoryExt used to create the user defined pluginExts. //! //! \param factory Pointer to an instance of the user implmentation of IPluginFactoryExt. //! - virtual void setPluginFactoryExt(IPluginFactoryExt* factory) = 0; + virtual void setPluginFactoryExt(IPluginFactoryExt* factory) TRTNOEXCEPT = 0; //! //! \brief Destroy this ICaffeParser object. //! - virtual void destroy() = 0; + virtual void destroy() TRTNOEXCEPT = 0; //! //! \brief Set the IPluginFactoryV2 used to create the user defined pluginV2 objects. //! //! \param factory Pointer to an instance of the user implmentation of IPluginFactoryV2. //! - virtual void setPluginFactoryV2(IPluginFactoryV2* factory) = 0; + virtual void setPluginFactoryV2(IPluginFactoryV2* factory) TRTNOEXCEPT = 0; //! //! \brief Set the namespace used to lookup and create plugins in the network. //! - virtual void setPluginNamespace(const char* libNamespace) = 0; + virtual void setPluginNamespace(const char* libNamespace) TRTNOEXCEPT = 0; protected: virtual ~ICaffeParser() {} + +public: + //! + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! 
\param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! + virtual void setErrorRecorder(nvinfer1::IErrorRecorder* recorder) TRTNOEXCEPT = 0; + + //! + //! \brief get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. + //! + //! \return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual nvinfer1::IErrorRecorder* getErrorRecorder() const TRTNOEXCEPT = 0; }; //! @@ -263,14 +289,14 @@ class ICaffeParser //! //! \see nvcaffeparser1::ICaffeParser //! -TENSORRTAPI ICaffeParser* createCaffeParser(); +TENSORRTAPI ICaffeParser* createCaffeParser() TRTNOEXCEPT; //! //! \brief Shuts down protocol buffers library. //! //! \note No part of the protocol buffers library can be used after this function is called. //! -TENSORRTAPI void shutdownProtobufLibrary(); +TENSORRTAPI void shutdownProtobufLibrary() TRTNOEXCEPT; } extern "C" TENSORRTAPI void* createNvCaffeParser_INTERNAL(); diff --git a/include/NvInfer.h b/include/NvInfer.h index 86ff3de4..808d8dbd 100644 --- a/include/NvInfer.h +++ b/include/NvInfer.h @@ -17,44 +17,17 @@ #ifndef NV_INFER_H #define NV_INFER_H -#include -#include - -#define NV_TENSORRT_MAJOR 5 //!< TensorRT major version. -#define NV_TENSORRT_MINOR 1 //!< TensorRT minor version. -#define NV_TENSORRT_PATCH 5 //!< TensorRT patch version. -#define NV_TENSORRT_BUILD 0 //!< TensorRT build number. - -#define NV_TENSORRT_SONAME_MAJOR 5 //!< Shared object library major version number. -#define NV_TENSORRT_SONAME_MINOR 1 //!< Shared object library minor version number. -#define NV_TENSORRT_SONAME_PATCH 5 //!< Shared object library patch version number. - -#if __cplusplus > 201103L -#define _TENSORRT_FINAL final -#define _TENSORRT_OVERRIDE override -#else -#define _TENSORRT_FINAL -#define _TENSORRT_OVERRIDE -#endif - -//!< Defines which symbols are exported -#ifdef TENSORRT_BUILD_LIB -#ifdef _MSC_VER -#define TENSORRTAPI __declspec(dllexport) -#else -#define TENSORRTAPI __attribute__((visibility("default"))) -#endif -#else -#define TENSORRTAPI -#endif +#include "NvInferRuntime.h" //! //! \mainpage //! -//! This is the API documentation for the NVIDIA TensorRT library. It provides information on individual functions, classes -//! and methods. Use the index on the left to navigate the documentation. +//! This is the API documentation for the NVIDIA TensorRT library. It provides information on individual +//! functions, classes and methods. Use the index on the left to navigate the documentation. //! -//! Please see the accompanying user guide and samples for higher-level information and general advice on using TensorRT. +//! Please see the accompanying user guide and samples for higher-level information and general advice on +//! using TensorRT. +// //! TensorRT Versioning follows Semantic Versioning Guidelines specified here: https://semver.org/ //! @@ -64,16 +37,6 @@ //! This is the top-level API file for TensorRT. //! -// forward declare some CUDA types to avoid an include dependency - -struct cublasContext; -struct cudnnContext; - -typedef struct CUstream_st* cudaStream_t; //!< Forward declaration of cudaStream_t. -typedef struct CUevent_st* cudaEvent_t; //!< Forward declaration of cudaEvent_t. 
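A hedged continuation of the sketch above for the setErrorRecorder/getErrorRecorder methods added to ICaffeParser; `myRecorder` is a hypothetical user-provided nvinfer1::IErrorRecorder implementation:

```cpp
#include <iostream>

// Register the recorder before parsing so that parser errors are captured.
parser->setErrorRecorder(&myRecorder);
parser->parse("deploy.prototxt", "model.caffemodel", *network, nvinfer1::DataType::kFLOAT);

// A default recorder does not exist, so getErrorRecorder() is nullptr unless one was registered.
nvinfer1::IErrorRecorder* rec = parser->getErrorRecorder();
for (int32_t i = 0; i < rec->getNbErrors(); ++i)
{
    std::cerr << rec->getErrorDesc(i) << std::endl;
}
rec->clear();
```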
- -static const int NV_TENSORRT_VERSION = (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH; // major, minor, patch - //! //! \namespace nvinfer1 //! @@ -82,78 +45,6 @@ static const int NV_TENSORRT_VERSION = (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT namespace nvinfer1 { -template -inline int EnumMax(); //!< Maximum number of elements in an enumeration type. - -//! -//! \enum DataType -//! \brief The type of weights and tensors. -//! -enum class DataType : int -{ - kFLOAT = 0, //!< FP32 format. - kHALF = 1, //!< FP16 format. - kINT8 = 2, //!< quantized INT8 format. - kINT32 = 3 //!< INT32 format. -}; - -template <> -inline int EnumMax() -{ - return 4; -} //!< Maximum number of elements in DataType enum. \see DataType - -//! -//! \enum DeviceType -//! \brief The device that this layer/network will execute on. -//! -//! -enum class DeviceType : int -{ - kGPU, //!< GPU Device - kDLA, //!< DLA Core -}; -template <> -inline int EnumMax() -{ - return 2; -} //!< Maximum number of elements in DeviceType enum. \see DeviceType - -//! -//! \enum DimensionType -//! \brief The type of data encoded across this dimension. -//! -enum class DimensionType : int -{ - kSPATIAL = 0, //!< Elements correspond to different spatial data. - kCHANNEL = 1, //!< Elements correspond to different channels. - kINDEX = 2, //!< Elements correspond to different batch index. - kSEQUENCE = 3 //!< Elements correspond to different sequence values. -}; - -template <> -inline int EnumMax() -{ - return 4; -} //!< Maximum number of elements in DimensionType enum. \see DimensionType - -//! -//! \class Dims -//! \brief Structure to define the dimensions of a tensor. -//! -//! \note: Currently the following formats are supported for layer inputs and outputs: -//! * zero or more index dimensions followed by one channel and two spatial dimensions (e.g. CHW) -//! * one time series dimension followed by one index dimension followed by one channel dimension (i.e. TNC) -//! -class Dims -{ -public: - static const int MAX_DIMS = 8; //!< The maximum number of dimensions supported for a tensor. - int nbDims; //!< The number of dimensions. - int d[MAX_DIMS]; //!< The extent of each dimension. - DimensionType type[MAX_DIMS]; //!< The type of each dimension. -}; - //! //! \class Dims2 //! \brief Descriptor for two-dimensional data. @@ -277,7 +168,9 @@ class Dims3 : public Dims //! \class DimsCHW //! \brief Descriptor for data with one channel dimension and two spatial dimensions. //! -class DimsCHW : public Dims3 +//! \deprecated DimsCHW will be removed in a future version of TensorRT, use Dims3 instead. +//! +class TRT_DEPRECATED DimsCHW : public Dims3 { public: //! @@ -385,7 +278,9 @@ class Dims4 : public Dims //! \class DimsNCHW //! \brief Descriptor for data with one index dimension, one channel dimension and two spatial dimensions. //! -class DimsNCHW : public Dims4 +//! \deprecated DimsNCHW will be removed in a future version of TensorRT, use Dims instead. +//! +class TRT_DEPRECATED DimsNCHW : public Dims4 { public: //! @@ -472,43 +367,6 @@ class DimsNCHW : public Dims4 int w() const { return d[3]; } }; -//! -//! \class Weights -//! -//! \brief An array of weights used as a layer parameter. -//! -//! The weights are held by reference until the engine has been built. Therefore the data referenced -//! by \p values field should be preserved until the build is complete. -//! -class Weights -{ -public: - DataType type; //!< The type of the weights. - const void* values; //!< The weight values, in a contiguous array. 
- int64_t count; //!< The number of weights in the array. -}; - -//! -//! \class IHostMemory -//! -//! \brief Class to handle library allocated memory that is accessible to the user. -//! -//! The memory allocated via the host memory object is owned by the library and will -//! be de-allocated when the destroy method is called. -//! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. -//! -class IHostMemory -{ -public: - virtual void* data() const = 0; //!< A pointer to the raw data that is owned by the library. - virtual std::size_t size() const = 0; //!< The size in bytes of the data that was allocated. - virtual DataType type() const = 0; //!< The type of the memory that was allocated. - virtual void destroy() = 0; //!< Destroy the allocated memory. -protected: - virtual ~IHostMemory() {} -}; - //! //! \enum LayerType //! @@ -542,31 +400,18 @@ enum class LayerType : int kRNN_V2 = 21, //!< RNNv2 layer. kIDENTITY = 22, //!< Identity layer. kPLUGIN_V2 = 23, //!< PluginV2 layer. - kSLICE = 24 //!< Slice layer. + kSLICE = 24, //!< Slice layer. + kSHAPE = 25, //!< Shape layer. + kPARAMETRIC_RELU = 26, //!< Parametric ReLU layer. + kRESIZE = 27 //!< Resize Layer. }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { - return 25; + return 28; } //!< Maximum number of elements in LayerType enum. \see LayerType -//! -//! \enum TensorLocation -//! \brief The location for tensor data storage, device or host. -//! -enum class TensorLocation : int -{ - kDEVICE = 0, //!< Data stored on device. - kHOST = 1 //!< Data stored on host. -}; - -template <> -inline int EnumMax() -{ - return 2; -} //!< Maximum number of elements in TensorLocation enum. \see TensorLocation - //! //! \class ITensor //! @@ -591,7 +436,7 @@ class ITensor //! //! \see getName() //! - virtual void setName(const char* name) = 0; + virtual void setName(const char* name) TRTNOEXCEPT = 0; //! //! \brief Get the tensor name. @@ -600,7 +445,7 @@ class ITensor //! //! \see setName() //! - virtual const char* getName() const = 0; + virtual const char* getName() const TRTNOEXCEPT = 0; //! //! \brief Set the dimensions of a tensor. @@ -609,38 +454,36 @@ class ITensor //! the layer parameters and the inputs to the layer. If a tensor size or a parameter is modified in the network, //! the dimensions of all dependent tensors will be recomputed. //! - //! This call is only legal for network input tensors, since the dimensions of layer output tensors are inferred based on - //! layer inputs and parameters. + //! This call is only legal for network input tensors, since the dimensions of layer output tensors are inferred + //! based on layer inputs and parameters. //! //! \param dimensions The dimensions of the tensor. //! //! \see getDimensions() //! - virtual void setDimensions(Dims dimensions) = 0; // only valid for input tensors + virtual void setDimensions(Dims dimensions) TRTNOEXCEPT = 0; // only valid for input tensors //! //! \brief Get the dimensions of a tensor. //! //! \return The dimensions of the tensor. //! + //! \warning getDimensions() returns a -1 for dimensions that are derived from a wildcard dimension. //! \see setDimensions() //! - virtual Dims getDimensions() const = 0; + virtual Dims getDimensions() const TRTNOEXCEPT = 0; //! //! \brief Set the data type of a tensor. //! //! \param type The data type of the tensor. //! - //! The type is unchanged if the type is - //! invalid for the given tensor. - //! - //! 
If the tensor is a network input or output, - //! then the tensor type cannot be DataType::kINT8. + //! The type is unchanged if the tensor is not a network input tensor, or marked as an output tensor or shape + //! output tensor. //! //! \see getType() //! - virtual void setType(DataType type) = 0; + virtual void setType(DataType type) TRTNOEXCEPT = 0; //! //! \brief Get the data type of a tensor. @@ -649,7 +492,7 @@ class ITensor //! //! \see setType() //! - virtual DataType getType() const = 0; + virtual DataType getType() const TRTNOEXCEPT = 0; //! //! \brief Set dynamic range for the tensor @@ -661,7 +504,7 @@ class ITensor //! //! Requires that min and max be finite, and min <= max. //! - virtual bool setDynamicRange(float min, float max) = 0; + virtual bool setDynamicRange(float min, float max) TRTNOEXCEPT = 0; //! //! \brief Get dynamic range for the tensor @@ -670,17 +513,17 @@ class ITensor //! //! \deprecated This interface is superceded by getDynamicRangeMin and getDynamicRangeMax. //! - virtual float getDynamicRange() const = 0; + TRT_DEPRECATED virtual float getDynamicRange() const TRTNOEXCEPT = 0; //! //! \brief Whether the tensor is a network input. //! - virtual bool isNetworkInput() const = 0; + virtual bool isNetworkInput() const TRTNOEXCEPT = 0; //! //! \brief Whether the tensor is a network output. //! - virtual bool isNetworkOutput() const = 0; + virtual bool isNetworkOutput() const TRTNOEXCEPT = 0; protected: virtual ~ITensor() {} @@ -692,72 +535,124 @@ class ITensor //! When a tensor is broadcast across a batch, it has the same value for every member in the batch. //! Memory is only allocated once for the single member. //! - //! This method is only valid for network input tensors, since the flags of layer output tensors are inferred based on - //! layer inputs and parameters. + //! This method is only valid for network input tensors, since the flags of layer output tensors are inferred based + //! on layer inputs and parameters. //! If this state is modified for a tensor in the network, the states of all dependent tensors will be recomputed. + //! If the tensor is for an explicit batch network, then this function does nothing. + //! + //! \warning The broadcast flag is ignored when using explicit batch network mode. //! //! \param broadcastAcrossBatch Whether to enable broadcast of tensor across the batch. //! //! \see getBroadcastAcrossBatch() //! - virtual void setBroadcastAcrossBatch(bool broadcastAcrossBatch) = 0; + virtual void setBroadcastAcrossBatch(bool broadcastAcrossBatch) TRTNOEXCEPT = 0; //! //! \brief Check if tensor is broadcast across the batch. //! //! When a tensor is broadcast across a batch, it has the same value for every member in the batch. - //! Memory is only allocated once for the single member. + //! Memory is only allocated once for the single member. If the network is in explicit batch mode, + //! this function returns true if the leading dimension is 1. //! //! \return True if tensor is broadcast across the batch, false otherwise. //! //! \see setBroadcastAcrossBatch() //! - virtual bool getBroadcastAcrossBatch() const = 0; + virtual bool getBroadcastAcrossBatch() const TRTNOEXCEPT = 0; //! //! \brief Get the storage location of a tensor. //! \return The location of tensor data. //! \see setLocation() //! - virtual TensorLocation getLocation() const = 0; + virtual TensorLocation getLocation() const TRTNOEXCEPT = 0; //! //! \brief Set the storage location of a tensor //! \param location the location of tensor data //! - //! 
Only input tensors for storing sequence lengths for RNNv2 are supported. + //! Only network input tensors for storing sequence lengths for RNNv2 are supported. //! Using host storage for layers that do not support it will generate //! errors at build time. //! //! \see getLocation() //! - virtual void setLocation(TensorLocation location) = 0; + virtual void setLocation(TensorLocation location) TRTNOEXCEPT = 0; //! //! \brief Query whether dynamic range is set. //! //! \return True if dynamic range is set, false otherwise. //! - virtual bool dynamicRangeIsSet() const = 0; + virtual bool dynamicRangeIsSet() const TRTNOEXCEPT = 0; //! //! \brief Undo effect of setDynamicRange. //! - virtual void resetDynamicRange() = 0; + virtual void resetDynamicRange() TRTNOEXCEPT = 0; //! //! \brief Get minimum of dynamic range. //! //! \return Minimum of dynamic range, or quiet NaN if range was not set. //! - virtual float getDynamicRangeMin() const = 0; + virtual float getDynamicRangeMin() const TRTNOEXCEPT = 0; //! //! \brief Get maximum of dynamic range. //! //! \return Maximum of dynamic range, or quiet NaN if range was not set. //! - virtual float getDynamicRangeMax() const = 0; + virtual float getDynamicRangeMax() const TRTNOEXCEPT = 0; + + //! + //! \brief Set allowed formats for this tensor. By default all formats are allowed. + //! Shape tensors (for which isShapeTensor() returns true) may only have row major linear format. + //! + //! \param formats A bitmask of TensorFormat values that are supported for this tensor. + //! + //! \see ITensor::getAllowedFormats() + //! + virtual void setAllowedFormats(TensorFormats formats) TRTNOEXCEPT = 0; + + //! + //! \brief Get a bitmask of TensorFormat values that the tensor supports. + //! For a shape tensor, only row major linear format is allowed. + //! + //! \return The value specified by setAllowedFormats or all possible formats. + //! + //! \see ITensor::setAllowedFormats() + //! + virtual TensorFormats getAllowedFormats() const TRTNOEXCEPT = 0; + + //! + //! \brief Whether the tensor is a shape tensor. + //! + //! If a tensor is a shape tensor and becomes an engine input or output, + //! then ICudaEngine::isShapeBinding will be true for that tensor. + //! + //! It is possible for a tensor to be both a shape tensor and an execution tensor. + //! + //! \return True if tensor is a shape tensor, false otherwise. + //! + virtual bool isShapeTensor() const TRTNOEXCEPT = 0; + + //! + //! \brief Whether the tensor is an execution tensor. + //! + //! If a tensor is an execution tensor and becomes an engine input or output, + //! then ICudaEngine::isExecutionBinding will be true for that tensor. + //! + //! Tensors are usually execution tensors. The exceptions are tensors used + //! solely for shape calculations or whose contents not needed to compute the outputs. + //! + //! A tensor with isShapeTensor() == false and isExecutionTensor() == false + //! can still show up as an input to the engine if its dimensions are required. + //! In that case, only its dimensions need to be set at runtime and a nullptr + //! can be passed instead of a pointer to its contents. + //! + virtual bool isExecutionTensor() const TRTNOEXCEPT = 0; }; //! @@ -775,7 +670,7 @@ class ILayer //! //! \see LayerType //! - virtual LayerType getType() const = 0; + virtual LayerType getType() const TRTNOEXCEPT = 0; //! //! \brief Set the name of a layer. @@ -784,7 +679,7 @@ class ILayer //! //! \see getName() //! 
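To illustrate the ITensor dynamic-range and format methods documented above, a minimal sketch assuming an already-populated INetworkDefinition; the range values are placeholders:

```cpp
// Override per-tensor INT8 dynamic ranges where none has been set.
for (int i = 0; i < network->getNbLayers(); ++i)
{
    nvinfer1::ITensor* t = network->getLayer(i)->getOutput(0);
    if (!t->dynamicRangeIsSet())
    {
        t->setDynamicRange(-2.0f, 2.0f); // both bounds finite, min <= max
    }
}

// Restrict a network input to the row-major linear format via the TensorFormats bitmask.
nvinfer1::ITensor* in = network->getInput(0);
in->setAllowedFormats(1U << static_cast<uint32_t>(nvinfer1::TensorFormat::kLINEAR));
```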
- virtual void setName(const char* name) = 0; + virtual void setName(const char* name) TRTNOEXCEPT = 0; //! //! \brief Return the name of a layer. @@ -792,56 +687,65 @@ class ILayer //! \see setName() //! - virtual const char* getName() const = 0; + virtual const char* getName() const TRTNOEXCEPT = 0; //! //! \brief Get the number of inputs of a layer. //! - virtual int getNbInputs() const = 0; + virtual int getNbInputs() const TRTNOEXCEPT = 0; //! //! \brief Get the layer input corresponding to the given index. //! //! \param index The index of the input tensor. //! - //! \return The input tensor, or nullptr if the index is out of range or the tensor is optional(\ref IRNNLayer and \ref IRNNv2Layer). + //! \return The input tensor, or nullptr if the index is out of range or the tensor is optional + //! (\ref IRNNLayer and \ref IRNNv2Layer). //! - virtual ITensor* getInput(int index) const = 0; + virtual ITensor* getInput(int index) const TRTNOEXCEPT = 0; //! //! \brief Get the number of outputs of a layer. //! - virtual int getNbOutputs() const = 0; + virtual int getNbOutputs() const TRTNOEXCEPT = 0; //! //! \brief Get the layer output corresponding to the given index. //! - //! \return The indexed output tensor, or nullptr if the index is out of range or the tensor is optional(\ref IRNNLayer and \ref IRNNv2Layer). + //! \return The indexed output tensor, or nullptr if the index is out of range or the tensor is optional + //! (\ref IRNNLayer and \ref IRNNv2Layer). //! - virtual ITensor* getOutput(int index) const = 0; + virtual ITensor* getOutput(int index) const TRTNOEXCEPT = 0; //! //! \brief replace an input of this layer with a specific tensor //! - //! Note that this method cannot change the number of inputs to a layer. The index argument must be less - //! than the value of getNbInputs() + //! Except of IShuffleLayer and ISliceLayer, this method cannot change the number of inputs to a layer. + //! The index argument must be less than the value of getNbInputs(). + //! + //! See comments for IShuffleLayer::setInput() and ISliceLayer::setInput() for their special behavior. //! //! \param index the index of the input to modify. //! \param tensor the new input tensor //! - virtual void setInput(int index, ITensor& tensor) = 0; + virtual void setInput(int index, ITensor& tensor) TRTNOEXCEPT = 0; //! //! \brief Set the computational precision of this layer //! - //! setting the precision forces TensorRT to choose implementations which run at this precision. If precision is not set, - //! TensorRT will select the computational precision based on performance considerations and the flags specified to the builder. + //! Setting the precision allows TensorRT to choose implementation which run at this computational precision. + //! Layer input type would also get inferred from layer computational precision. TensorRT could still choose a + //! non-conforming fastest implementation ignoring set layer precision. Use BuilderFlag::kSTRICT_TYPES to force + //! choose implementations with requested precision. In case no implementation is found with requested precision, + //! TensorRT would choose available fastest implementation. If precision is not set, TensorRT will select the layer + //! computational precision and layer input type based on performance considerations and the flags specified to the + //! builder. //! //! \param precision the computational precision. //! //! 
\see getPrecision() precisionIsSet() resetPrecision() - virtual void setPrecision(DataType dataType) = 0; + virtual void setPrecision(DataType dataType) TRTNOEXCEPT = 0; //! //! \brief get the computational precision of this layer @@ -850,7 +754,7 @@ class ILayer //! //! \see setPrecision() precisionIsSet() resetPrecision() - virtual DataType getPrecision() const = 0; + virtual DataType getPrecision() const TRTNOEXCEPT = 0; //! //! \brief whether the computational precision has been set for this layer @@ -859,27 +763,40 @@ class ILayer //! //! \see setPrecision() getPrecision() resetPrecision() - virtual bool precisionIsSet() const = 0; + virtual bool precisionIsSet() const TRTNOEXCEPT = 0; //! //! \brief reset the computational precision for this layer //! //! \see setPrecision() getPrecision() precisionIsSet() - virtual void resetPrecision() = 0; + virtual void resetPrecision() TRTNOEXCEPT = 0; //! //! \brief Set the output type of this layer //! - //! setting the output type constrains TensorRT to choose implementations which generate output data with the given type. - //! If it is not set, TensorRT will select the implementation based on performance considerations and the flags specified to the builder. + //! Setting the output type constrains TensorRT to choose implementations which generate output data with the + //! given type. If it is not set, TensorRT will select output type based on layer computational precision. TensorRT + //! could still choose non-conforming output type based on fastest implementation. Use BuilderFlag::kSTRICT_TYPES to + //! force choose requested output type. In case layer precision is not specified, output type would depend on + //! choosen implementation based on performance considerations and the flags specified to the builder. Note that + //! this method cannot be used to set the data type of the second output tensor of the topK layer. The data type of + //! the second output tensor of the topK layer is always Int32. Also the output type of all layers that are shape + //! operations must be DataType::kINT32, and all attempts to set the output type to some other data type will be + //! ignored except for issuing an error message. + //! + //! Note that the layer output type is generally not identical to the data type of the output tensor, as TensorRT may insert + //! implicit reformatting operations to convert the former to the latter. Calling layer->setOutputType(i, type) + //! has no effect on the data type of the i-th output tensor of layer, and users need to call layer->getOutput(i)->setType(type) + //! to change the tensor data type. This is particularly relevant if the tensor is marked as a network output, since only + //! setType() [but not setOutputType()] will affect the data representation in the corresponding output binding. //! //! \param index the index of the output to set //! \param dataType the type of the output //! //! \see getOutputType() outputTypeIsSet() resetOutputType() - virtual void setOutputType(int index, DataType dataType) = 0; + virtual void setOutputType(int index, DataType dataType) TRTNOEXCEPT = 0; //! //! \brief get the output type of this layer @@ -890,7 +807,7 @@ class ILayer //! //! \see getOutputType() outputTypeIsSet() resetOutputType() - virtual DataType getOutputType(int index) const = 0; + virtual DataType getOutputType(int index) const TRTNOEXCEPT = 0; //! //! \brief whether the output type has been set for this layer @@ -900,7 +817,7 @@ class ILayer //! //! 
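A minimal sketch of the layer precision and output-type controls described above, assuming `builder` and `network` from the earlier sketch; the layer index is a placeholder:

```cpp
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
config->setFlag(nvinfer1::BuilderFlag::kFP16);
config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); // prefer conforming implementations over faster ones

nvinfer1::ILayer* layer = network->getLayer(0);
layer->setPrecision(nvinfer1::DataType::kHALF);     // computational precision of the layer
layer->setOutputType(0, nvinfer1::DataType::kHALF); // type produced by the chosen implementation

// setOutputType() does not change the tensor's own data type; for an output binding also call:
layer->getOutput(0)->setType(nvinfer1::DataType::kHALF);
```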
\see setOutputType() getOutputType() resetOutputType() - virtual bool outputTypeIsSet(int index) const = 0; + virtual bool outputTypeIsSet(int index) const TRTNOEXCEPT = 0; //! //! \brief reset the output type for this layer @@ -909,7 +826,7 @@ class ILayer //! //! \see setOutputType() getOutputType() outputTypeIsSet() - virtual void resetOutputType(int index) = 0; + virtual void resetOutputType(int index) TRTNOEXCEPT = 0; protected: virtual ~ILayer() {} @@ -919,12 +836,12 @@ class ILayer //! \enum PaddingMode //! //! \brief Enumerates the modes of padding to perform in convolution, deconvolution and pooling layer, -//! padding mode gets precedence if setPaddingMode() and setPrePadding() are also used. +//! padding mode takes precedence if setPaddingMode() and setPrePadding() are also used. //! //! kEXPLICIT* padding is to use explicit padding. -//! kSAME* padding is to implicitly calculate padding to keep output dim to be the "same" with input dim. For convolution and pooling, -//! output dim is ceil(input dim, stride), for deconvolution it is inverse, then use the output dim to calculate padding size. -//! kCAFFE* padding is symmetric padding. +//! kSAME* padding is to implicitly calculate padding to keep output dim to be the "same" with input dim. For +//! convolution and pooling, output dim is ceil(input dim, stride), for deconvolution it is inverse, then use +//! the output dim to calculate padding size. kCAFFE* padding is symmetric padding. //! enum class PaddingMode : int { @@ -937,7 +854,7 @@ enum class PaddingMode : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 6; } //!< Maximum number of elements in PaddingMode enum. \see PaddingMode @@ -947,9 +864,8 @@ inline int EnumMax() //! //! \brief A convolution layer in a network definition. //! -//! This layer performs a correlation operation between 3-dimensional filter with a 4-dimensional tensor to produce another 4-dimensional tensor. -//! -//! The HW output size of the convolution is set according to the \p INetworkCustomDimensions set in INetworkDefinition::setCustomConvolutionDimensions(). +//! This layer performs a correlation operation between 3-dimensional filter with a 4-dimensional tensor to produce +//! another 4-dimensional tensor. //! //! An optional bias argument is supported, which adds a per-channel constant to each value in the output. //! @@ -965,14 +881,14 @@ class IConvolutionLayer : public ILayer //! //! \see getKernelSize() //! - virtual void setKernelSize(DimsHW kernelSize) = 0; + virtual void setKernelSize(DimsHW kernelSize) TRTNOEXCEPT = 0; //! //! \brief Get the HW kernel size of the convolution. //! //! \see setKernelSize() //! - virtual DimsHW getKernelSize() const = 0; + virtual DimsHW getKernelSize() const TRTNOEXCEPT = 0; //! //! \brief Set the number of output maps for the convolution. @@ -981,14 +897,14 @@ class IConvolutionLayer : public ILayer //! //! \see getNbOutputMaps() //! - virtual void setNbOutputMaps(int nbOutputMaps) = 0; + virtual void setNbOutputMaps(int nbOutputMaps) TRTNOEXCEPT = 0; //! //! \brief Get the number of output maps for the convolution. //! //! \see setNbOutputMaps() //! - virtual int getNbOutputMaps() const = 0; + virtual int getNbOutputMaps() const TRTNOEXCEPT = 0; //! //! \brief Get the stride of the convolution. @@ -997,19 +913,20 @@ class IConvolutionLayer : public ILayer //! //! If executing this layer on DLA, both height and width of stride must be in the range [1,8]. //! - //! \see setStride() + //! \see getStride() //! 
- virtual void setStride(DimsHW stride) = 0; + virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; //! //! \brief Get the stride of the convolution. //! - virtual DimsHW getStride() const = 0; + virtual DimsHW getStride() const TRTNOEXCEPT = 0; //! //! \brief Set the padding of the convolution. //! - //! The input will be zero-padded by this number of elements in the height and width directions. Padding is symmetric. + //! The input will be zero-padded by this number of elements in the height and width directions. + //! Padding is symmetric. //! //! Default: (0,0) //! @@ -1017,70 +934,73 @@ class IConvolutionLayer : public ILayer //! //! \see getPadding() //! - virtual void setPadding(DimsHW padding) = 0; + virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding of the convolution. If the padding is asymmetric, the pre-padding is returned. //! //! \see setPadding() //! - virtual DimsHW getPadding() const = 0; + virtual DimsHW getPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the number of groups for a convolution. //! - //! The input tensor channels are divided into \p nbGroups groups, and a convolution is executed for each group, using a filter per group. The results of the group - //! convolutions are concatenated to form the output. + //! The input tensor channels are divided into \p nbGroups groups, and a convolution is executed for each group, + //! using a filter per group. The results of the group convolutions are concatenated to form the output. //! - //! \note When using groups in int8 mode, the size of the groups (i.e. the channel count divided by the group count) must be a multiple of 4 for both input and output. + //! \note When using groups in int8 mode, the size of the groups (i.e. the channel count divided by the group + //! count) must be a multiple of 4 for both input and output. //! //! Default: 1 //! //! \see getNbGroups() //! - virtual void setNbGroups(int nbGroups) = 0; + virtual void setNbGroups(int nbGroups) TRTNOEXCEPT = 0; //! - //! \brief Set the number of groups for a convolution. + //! \brief Get the number of groups of the convolution. //! //! \see setNbGroups() //! - virtual int getNbGroups() const = 0; + virtual int getNbGroups() const TRTNOEXCEPT = 0; //! //! \brief Set the kernel weights for the convolution. //! - //! The weights are specified as a contiguous array in \p GKCRS order, where \p G is the number of groups, \p K the number of output feature maps, \p C the number of - //! input channels, and \p R and \p S are the height and width of the filter. + //! The weights are specified as a contiguous array in \p GKCRS order, where \p G is the number of groups, \p K + //! the number of output feature maps, \p C the number of input channels, and \p R and \p S are the height and + //! width of the filter. //! //! \see getKernelWeights() //! - virtual void setKernelWeights(Weights weights) = 0; + virtual void setKernelWeights(Weights weights) TRTNOEXCEPT = 0; //! - //! \brief Get the kernel weights for the convolution. + //! \brief Get the kernel weights of the convolution. //! //! \see setKernelWeights() //! - virtual Weights getKernelWeights() const = 0; + virtual Weights getKernelWeights() const TRTNOEXCEPT = 0; //! //! \brief Set the bias weights for the convolution. //! //! Bias is optional. To omit bias, set the count value of the weights structure to zero. //! - //! The bias is applied per-channel, so the number of weights (if non-zero) must be equal to the number of output feature maps. + //! 
The bias is applied per-channel, so the number of weights (if non-zero) must be equal to the number of output + //! feature maps. //! //! \see getBiasWeights() //! - virtual void setBiasWeights(Weights weights) = 0; + virtual void setBiasWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the bias weights for the convolution. //! //! \see setBiasWeights() //! - virtual Weights getBiasWeights() const = 0; + virtual Weights getBiasWeights() const TRTNOEXCEPT = 0; //! //! \brief Set the dilation for a convolution. @@ -1089,14 +1009,14 @@ class IConvolutionLayer : public ILayer //! //! \see getDilation() //! - virtual void setDilation(DimsHW dims) = 0; + virtual void setDilation(DimsHW dilation) TRTNOEXCEPT = 0; //! //! \brief Get the dilation for a convolution. //! //! \see setDilation() //! - virtual DimsHW getDilation() const = 0; + virtual DimsHW getDilation() const TRTNOEXCEPT = 0; protected: virtual ~IConvolutionLayer() {} @@ -1113,14 +1033,14 @@ class IConvolutionLayer : public ILayer //! //! \see getPrePadding() //! - virtual void setPrePadding(Dims padding) = 0; + virtual void setPrePadding(Dims padding) TRTNOEXCEPT = 0; //! //! \brief Get the pre-padding. //! //! \see setPrePadding() //! - virtual Dims getPrePadding() const = 0; + virtual Dims getPrePadding() const TRTNOEXCEPT = 0; //! //! \brief Set the post-padding. @@ -1133,24 +1053,25 @@ class IConvolutionLayer : public ILayer //! //! \see getPostPadding() //! - virtual void setPostPadding(Dims padding) = 0; + virtual void setPostPadding(Dims padding) TRTNOEXCEPT = 0; //! //! \brief Get the post-padding. //! //! \see setPostPadding() //! - virtual Dims getPostPadding() const = 0; + virtual Dims getPostPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the padding mode. //! - //! Padding mode gets precedence if both setPaddingMode and setPre/PostPadding are used. + //! Padding mode takes precedence if both setPaddingMode and setPre/PostPadding are used. //! //! Default: kEXPLICIT_ROUND_DOWN //! //! \see getPaddingMode() - virtual void setPaddingMode(PaddingMode paddingMode) = 0; + //! + virtual void setPaddingMode(PaddingMode paddingMode) TRTNOEXCEPT = 0; //! //! \brief Get the padding mode. @@ -1158,7 +1079,81 @@ class IConvolutionLayer : public ILayer //! Default: kEXPLICIT_ROUND_DOWN //! //! \see setPaddingMode() - virtual PaddingMode getPaddingMode() const = 0; + //! + virtual PaddingMode getPaddingMode() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension kernel size of the convolution. + //! + //! If executing this layer on DLA, only support 2D kernel size, both height and width of kernel size must be in the range [1,16]. + //! + //! \see getKernelSizeNd() setKernelSize() getKernelSize() + //! + virtual void setKernelSizeNd(Dims kernelSize) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension kernel size of the convolution. + //! + //! \see setKernelSizeNd() + //! + virtual Dims getKernelSizeNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension stride of the convolution. + //! + //! Default: (1, 1, ..., 1) + //! + //! If executing this layer on DLA, only support 2D stride, both height and width of stride must be in the range [1,8]. + //! + //! \see getStrideNd() setStride() getStride() + //! + virtual void setStrideNd(Dims stride) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension stride of the convolution. + //! + //! \see setStrideNd() + //! + virtual Dims getStrideNd() const TRTNOEXCEPT = 0; + + //! + //! 
\brief Set the multi-dimension padding of the convolution. + //! + //! The input will be zero-padded by this number of elements in each dimension. + //! Padding is symmetric. + //! + //! Default: (0, 0, ..., 0) + //! + //! If executing this layer on DLA, only support 2D padding, both height and width of padding must be in the range [0,15]. + //! + //! \see getPaddingNd() setPadding() getPadding() + //! + virtual void setPaddingNd(Dims padding) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension padding of the convolution. + //! + //! If the padding is asymmetric, the pre-padding is returned. + //! + //! \see setPaddingNd() + //! + virtual Dims getPaddingNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension dilation of the convolution. + //! + //! Default: (1, 1, ..., 1) + //! + //! \see getDilation() + //! + virtual void setDilationNd(Dims dilation) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension dilation of the convolution. + //! + //! \see setDilation() + //! + virtual Dims getDilationNd() const TRTNOEXCEPT = 0; }; //! \class IFullyConnectedLayer @@ -1200,28 +1195,28 @@ class IFullyConnectedLayer : public ILayer //! //! \see getNbOutputChannels() //! - virtual void setNbOutputChannels(int nbOutputs) = 0; + virtual void setNbOutputChannels(int nbOutputs) TRTNOEXCEPT = 0; //! //! \brief Get the number of output channels `K` from the fully connected layer. //! //! \see setNbOutputChannels() //! - virtual int getNbOutputChannels() const = 0; + virtual int getNbOutputChannels() const TRTNOEXCEPT = 0; //! //! \brief Set the kernel weights, given as a `KxC` matrix in row-major order. //! //! \see getKernelWeights() //! - virtual void setKernelWeights(Weights weights) = 0; + virtual void setKernelWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the kernel weights. //! //! \see setKernelWeights() //! - virtual Weights getKernelWeights() const = 0; + virtual Weights getKernelWeights() const TRTNOEXCEPT = 0; //! //! \brief Set the bias weights. @@ -1230,46 +1225,19 @@ class IFullyConnectedLayer : public ILayer //! //! \see getBiasWeightsWeights() //! - virtual void setBiasWeights(Weights weights) = 0; + virtual void setBiasWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the bias weights. //! //! \see setBiasWeightsWeights() //! - virtual Weights getBiasWeights() const = 0; + virtual Weights getBiasWeights() const TRTNOEXCEPT = 0; protected: virtual ~IFullyConnectedLayer() {} }; -//! -//! \enum ActivationType -//! -//! \brief Enumerates the types of activation to perform in an activation layer. -//! -enum class ActivationType : int -{ - kRELU = 0, //!< Rectified linear activation. - kSIGMOID = 1, //!< Sigmoid activation. - kTANH = 2, //!< TanH activation. - kLEAKY_RELU = 3, //!< LeakyRelu activation: x>=0 ? x : alpha * x. - kELU = 4, //!< Elu activation: x>=0 ? x : alpha * (exp(x) - 1). - kSELU = 5, //!< Selu activation: x>0 ? beta * x : beta * (alpha*exp(x) - alpha) - kSOFTSIGN = 6, //!< Softsign activation: x / (1+|x|) - kSOFTPLUS = 7, //!< Parametric softplus activation: alpha*log(exp(beta*x)+1) - kCLIP = 8, //!< Clip activation: max(alpha, min(beta, x)) - kHARD_SIGMOID = 9, //!< Hard sigmoid activation: max(0, min(1, alpha*x+beta)) - kSCALED_TANH = 10, //!< Scaled tanh activation: alpha*tanh(beta*x) - kTHRESHOLDED_RELU = 11 //!< Thresholded ReLU activation: x>alpha : x : 0 -}; - -template <> -inline int EnumMax() -{ - return 12; -} //!< Maximum number of elements in ActivationType enum. \see ActivationType - //! //! 
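A sketch of the multi-dimension convolution setters documented above, assuming a 5-D (NCDHW) `input` tensor already in the network and hypothetical pre-populated nvinfer1::Weights `w` and `b`:

```cpp
nvinfer1::IConvolutionLayer* conv
    = network->addConvolutionNd(*input, 64, nvinfer1::Dims3(3, 3, 3), w, b);
conv->setStrideNd(nvinfer1::Dims3(1, 1, 1));
conv->setPaddingNd(nvinfer1::Dims3(1, 1, 1)); // symmetric zero padding in each spatial dimension
conv->setNbGroups(1);

// Alternatively, let TensorRT derive the padding; an explicit padding mode takes precedence
// over setPaddingNd()/setPrePadding()/setPostPadding().
conv->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
```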
\class IActivationLayer //! @@ -1289,14 +1257,14 @@ class IActivationLayer : public ILayer //! //! \see getActivationType(), ActivationType //! - virtual void setActivationType(ActivationType type) = 0; + virtual void setActivationType(ActivationType type) TRTNOEXCEPT = 0; //! //! \brief Get the type of activation to be performed. //! //! \see setActivationType(), ActivationType //! - virtual ActivationType getActivationType() const = 0; + virtual ActivationType getActivationType() const TRTNOEXCEPT = 0; protected: virtual ~IActivationLayer() {} @@ -1311,7 +1279,7 @@ class IActivationLayer : public ILayer //! It is ignored by the other activations. //! //! \see getAlpha(), setBeta() - virtual void setAlpha(float alpha) = 0; + virtual void setAlpha(float alpha) TRTNOEXCEPT = 0; //! //! \brief Set the beta parameter (must be finite). @@ -1322,19 +1290,19 @@ class IActivationLayer : public ILayer //! It is ignored by the other activations. //! //! \see getBeta(), setAlpha() - virtual void setBeta(float beta) = 0; + virtual void setBeta(float beta) TRTNOEXCEPT = 0; //! //! \brief Get the alpha parameter. //! //! \see getBeta(), setAlpha() - virtual float getAlpha() const = 0; + virtual float getAlpha() const TRTNOEXCEPT = 0; //! //! \brief Get the beta parameter. //! //! \see getAlpha(), setBeta() - virtual float getBeta() const = 0; + virtual float getBeta() const TRTNOEXCEPT = 0; }; //! @@ -1346,11 +1314,11 @@ enum class PoolingType : int { kMAX = 0, // Maximum over elements kAVERAGE = 1, // Average over elements. If the tensor is padded, the count includes the padding - kMAX_AVERAGE_BLEND = 2 // Blending between the max pooling and average pooling: (1-blendFactor)*maxPool + blendFactor*avgPool + kMAX_AVERAGE_BLEND = 2 // Blending between max and average pooling: (1-blendFactor)*maxPool + blendFactor*avgPool }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 3; } //!< Maximum number of elements in PoolingType enum. \see PoolingType @@ -1361,8 +1329,6 @@ inline int EnumMax() //! //! The layer applies a reduction operation within a window over the input. //! -//! The output size is determined from the input size using the formula set by INetworkDefinition::setCustomPoolingDimensions(). -//! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! class IPoolingLayer : public ILayer @@ -1375,14 +1341,14 @@ class IPoolingLayer : public ILayer //! //! \see getPoolingType(), PoolingType //! - virtual void setPoolingType(PoolingType type) = 0; + virtual void setPoolingType(PoolingType type) TRTNOEXCEPT = 0; //! //! \brief Get the type of activation to be performed. //! //! \see setPoolingType(), PoolingType //! - virtual PoolingType getPoolingType() const = 0; + virtual PoolingType getPoolingType() const TRTNOEXCEPT = 0; //! //! \brief Set the window size for pooling. @@ -1391,14 +1357,14 @@ class IPoolingLayer : public ILayer //! //! \see getWindowSize() //! - virtual void setWindowSize(DimsHW windowSize) = 0; + virtual void setWindowSize(DimsHW windowSize) TRTNOEXCEPT = 0; //! //! \brief Get the window size for pooling. //! //! \see setWindowSize() //! - virtual DimsHW getWindowSize() const = 0; + virtual DimsHW getWindowSize() const TRTNOEXCEPT = 0; //! //! \brief Set the stride for pooling. @@ -1409,14 +1375,14 @@ class IPoolingLayer : public ILayer //! //! \see getStride() //! - virtual void setStride(DimsHW stride) = 0; + virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; //! //! 
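For the activation alpha/beta parameters documented above, a short sketch; the bounds below express ReLU6 as a clip activation and are placeholders:

```cpp
nvinfer1::IActivationLayer* act = network->addActivation(*input, nvinfer1::ActivationType::kCLIP);
act->setAlpha(0.0f); // lower clip bound
act->setBeta(6.0f);  // upper clip bound
```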
\brief Get the stride for pooling. //! //! \see setStride() //! - virtual DimsHW getStride() const = 0; + virtual DimsHW getStride() const TRTNOEXCEPT = 0; //! //! \brief Set the padding for pooling. @@ -1425,53 +1391,57 @@ class IPoolingLayer : public ILayer //! //! If executing this layer on DLA, both height and width of padding must be in the range [0,7]. //! - //! \see getStride() + //! \see getPadding() //! - virtual void setPadding(DimsHW padding) = 0; + virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding for pooling. //! //! Default: 0 //! - //! \see getStride() + //! \see setPadding() //! - virtual DimsHW getPadding() const = 0; + virtual DimsHW getPadding() const TRTNOEXCEPT = 0; //! - //! \brief Set the blending factor for the max_average_blend mode: max_average_blendPool = (1-blendFactor)*maxPool + blendFactor*avgPool + //! \brief Set the blending factor for the max_average_blend mode: + //! max_average_blendPool = (1-blendFactor)*maxPool + blendFactor*avgPool //! blendFactor is a user value in [0,1] with the default value of 0.0 //! This value only applies for the kMAX_AVERAGE_BLEND mode. //! //! \see getBlendFactor() //! - virtual void setBlendFactor(float blendFactor) = 0; + virtual void setBlendFactor(float blendFactor) TRTNOEXCEPT = 0; //! - //! \brief Get the blending factor for the max_average_blend mode: max_average_blendPool = (1-blendFactor)*maxPool + blendFactor*avgPool + //! \brief Get the blending factor for the max_average_blend mode: + //! max_average_blendPool = (1-blendFactor)*maxPool + blendFactor*avgPool //! blendFactor is a user value in [0,1] with the default value of 0.0 //! In modes other than kMAX_AVERAGE_BLEND, blendFactor is ignored. //! //! \see setBlendFactor() //! - virtual float getBlendFactor() const = 0; + virtual float getBlendFactor() const TRTNOEXCEPT = 0; //! - //! \brief Set whether average pooling uses as a denominator the overlap area between the window and the unpadded input. + //! \brief Set whether average pooling uses as a denominator the overlap area between the window + //! and the unpadded input. //! If this is not set, the denominator is the overlap between the pooling window and the padded input. //! //! Default: true //! //! \see getAverageCountExcludesPadding() //! - virtual void setAverageCountExcludesPadding(bool exclusive) = 0; + virtual void setAverageCountExcludesPadding(bool exclusive) TRTNOEXCEPT = 0; //! - //! \brief Get whether exclusive pooling uses as a denominator the overlap area betwen the window and the unpadded input. + //! \brief Get whether exclusive pooling uses as a denominator the overlap area betwen the window + //! and the unpadded input. //! //! \see setAverageCountExcludesPadding() //! - virtual bool getAverageCountExcludesPadding() const = 0; + virtual bool getAverageCountExcludesPadding() const TRTNOEXCEPT = 0; protected: virtual ~IPoolingLayer() {} @@ -1488,14 +1458,14 @@ class IPoolingLayer : public ILayer //! //! \see getPadding() //! - virtual void setPrePadding(Dims padding) = 0; + virtual void setPrePadding(Dims padding) TRTNOEXCEPT = 0; //! //! \brief Get the pre-padding. //! //! \see setPrePadding() //! - virtual Dims getPrePadding() const = 0; + virtual Dims getPrePadding() const TRTNOEXCEPT = 0; //! //! \brief Set the post-padding. @@ -1508,24 +1478,24 @@ class IPoolingLayer : public ILayer //! //! \see getPadding() //! - virtual void setPostPadding(Dims padding) = 0; + virtual void setPostPadding(Dims padding) TRTNOEXCEPT = 0; //! //! 
\brief Get the padding. //! //! \see setPadding() //! - virtual Dims getPostPadding() const = 0; + virtual Dims getPostPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the padding mode. //! - //! Padding mode gets precedence if both setPaddingMode and setPre/PostPadding are used. + //! Padding mode takes precedence if both setPaddingMode and setPre/PostPadding are used. //! //! Default: kEXPLICIT_ROUND_DOWN //! //! \see getPaddingMode() - virtual void setPaddingMode(PaddingMode paddingMode) = 0; + virtual void setPaddingMode(PaddingMode paddingMode) TRTNOEXCEPT = 0; //! //! \brief Get the padding mode. @@ -1533,7 +1503,64 @@ class IPoolingLayer : public ILayer //! Default: kEXPLICIT_ROUND_DOWN //! //! \see setPaddingMode() - virtual PaddingMode getPaddingMode() const = 0; + virtual PaddingMode getPaddingMode() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension window size for pooling. + //! + //! If executing this layer on DLA, only support 2D window size, both height and width of window size must be in the range [1,8]. + //! + //! \see getWindowSizeNd() setWindowSize() getWindowSize() + //! + virtual void setWindowSizeNd(Dims windowSize) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension window size for pooling. + //! + //! \see setWindowSizeNd() + //! + virtual Dims getWindowSizeNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension stride for pooling. + //! + //! Default: (1, 1, ..., 1) + //! + //! If executing this layer on DLA, only support 2D stride, both height and width of stride must be in the range [1,16]. + //! + //! \see getStrideNd() setStride() getStride() + //! + virtual void setStrideNd(Dims stride) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension stride for pooling. + //! + //! \see setStrideNd() + //! + virtual Dims getStrideNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension padding for pooling. + //! + //! The input will be zero-padded by this number of elements in each dimension. + //! Padding is symmetric. + //! + //! Default: (0, 0, ..., 0) + //! + //! If executing this layer on DLA, only support 2D padding, both height and width of padding must be in the range [0,7]. + //! + //! \see getPaddingNd() setPadding() getPadding() + //! + virtual void setPaddingNd(Dims padding) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension padding for pooling. + //! + //! If the padding is asymmetric, the pre-padding is returned. + //! + //! \see setPaddingNd() + //! + virtual Dims getPaddingNd() const TRTNOEXCEPT = 0; }; //! @@ -1554,14 +1581,14 @@ class ILRNLayer : public ILayer //! The window size must be odd and in the range of [1, 15]. //! \see setWindowStride() //! - virtual void setWindowSize(int windowSize) = 0; + virtual void setWindowSize(int windowSize) TRTNOEXCEPT = 0; //! //! \brief Get the LRN window size. //! //! \see getWindowStride() //! - virtual int getWindowSize() const = 0; + virtual int getWindowSize() const TRTNOEXCEPT = 0; //! //! \brief Set the LRN alpha value. @@ -1569,14 +1596,14 @@ class ILRNLayer : public ILayer //! The valid range is [-1e20, 1e20]. //! \see getAlpha() //! - virtual void setAlpha(float alpha) = 0; + virtual void setAlpha(float alpha) TRTNOEXCEPT = 0; //! //! \brief Get the LRN alpha value. //! //! \see setAlpha() //! - virtual float getAlpha() const = 0; + virtual float getAlpha() const TRTNOEXCEPT = 0; //! //! \brief Set the LRN beta value. @@ -1584,14 +1611,14 @@ class ILRNLayer : public ILayer //! The valid range is [0.01, 1e5f]. //! 
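The new multi-dimension accessors (setWindowSizeNd, setStrideNd, setPaddingNd) target 3D pooling among other cases. A hedged sketch follows; it assumes the input tensor has a depth dimension and that `INetworkDefinition::addPoolingNd()` is available in this release (that method is not shown in this hunk).

```cpp
// 3D max pooling: 2x2x2 window, stride 2, padding 1 on the H and W axes only.
IPoolingLayer* pool3d = network->addPoolingNd(*data, PoolingType::kMAX, Dims3{2, 2, 2});
pool3d->setStrideNd(Dims3{2, 2, 2});
pool3d->setPaddingNd(Dims3{0, 1, 1});
```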
\see getBeta() //! - virtual void setBeta(float beta) = 0; + virtual void setBeta(float beta) TRTNOEXCEPT = 0; //! //! \brief Get the LRN beta value. //! //! \see setBeta() //! - virtual float getBeta() const = 0; + virtual float getBeta() const TRTNOEXCEPT = 0; //! //! \brief Set the LRN K value. @@ -1599,14 +1626,14 @@ class ILRNLayer : public ILayer //! The valid range is [1e-5, 1e10]. //! \see getK() //! - virtual void setK(float k) = 0; + virtual void setK(float k) TRTNOEXCEPT = 0; //! //! \brief Get the LRN K value. //! //! \see setK() //! - virtual float getK() const = 0; + virtual float getK() const TRTNOEXCEPT = 0; protected: virtual ~ILRNLayer() {} @@ -1620,12 +1647,12 @@ class ILRNLayer : public ILayer enum class ScaleMode : int { kUNIFORM = 0, //!< Identical coefficients across all elements of the tensor. - kCHANNEL = 1, //!< Per-channel coefficients. The channel dimension is assumed to be the third to last dimension + kCHANNEL = 1, //!< Per-channel coefficients. kELEMENTWISE = 2 //!< Elementwise coefficients. }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 3; } //!< Maximum number of elements in ScaleMode enum. \see ScaleMode @@ -1660,59 +1687,74 @@ class IScaleLayer : public ILayer //! //! \see getMode() //! - virtual void setMode(ScaleMode mode) = 0; + virtual void setMode(ScaleMode mode) TRTNOEXCEPT = 0; //! //! \brief Set the scale mode. //! //! \see setMode() //! - virtual ScaleMode getMode() const = 0; + virtual ScaleMode getMode() const TRTNOEXCEPT = 0; //! //! \brief Set the shift value. //! //! \see getShift() //! - virtual void setShift(Weights shift) = 0; + virtual void setShift(Weights shift) TRTNOEXCEPT = 0; //! //! \brief Get the shift value. //! //! \see setShift() //! - virtual Weights getShift() const = 0; + virtual Weights getShift() const TRTNOEXCEPT = 0; //! //! \brief Set the scale value. //! //! \see getScale() //! - virtual void setScale(Weights scale) = 0; + virtual void setScale(Weights scale) TRTNOEXCEPT = 0; //! //! \brief Get the scale value. //! //! \see setScale() //! - virtual Weights getScale() const = 0; + virtual Weights getScale() const TRTNOEXCEPT = 0; //! //! \brief Set the power value. //! //! \see getPower() //! - virtual void setPower(Weights power) = 0; + virtual void setPower(Weights power) TRTNOEXCEPT = 0; //! //! \brief Get the power value. //! //! \see setPower() //! - virtual Weights getPower() const = 0; + virtual Weights getPower() const TRTNOEXCEPT = 0; protected: virtual ~IScaleLayer() {} + +public: + //! + //! \brief Get the channel axis. + //! + //! \return channelAxis parameter passed to addScaleNd() + //! + //! The value is the index of the channel axis in the input tensor's dimensions. All dimensions + //! after the channel axis are assumed to be spatial dimensions, and the only spatial dimensions + //! in the tensor. The number of spatial dimensions is thus getDimensions().nbDims - channelAxis - 1. + //! Supported numbers of spatial dimensions are 2 and 3 for 2d and 3d scale layers respectively. + //! + //! \see addScaleNd() + //! + virtual int getChannelAxis() const TRTNOEXCEPT = 0; }; //! @@ -1734,26 +1776,40 @@ class ISoftMaxLayer : public ILayer //! //! \brief Set the axis along which softmax is computed. Currently, only one axis can be set. //! - //! The axis is specified by setting the bit corresponding to the axis, after excluding the batch dimension, to 1. + //! The axis is specified by setting the bit corresponding to the axis to 1. //! 
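A short sketch of the LRN and Scale layers documented above, with the same `network`/`data` assumptions. The per-channel arrays are placeholders (assuming 64 input channels) and must remain valid until the engine is built.

```cpp
static float shiftVals[64]{};                    // per-channel shift, zero-initialized
static float scaleVals[64];                      // per-channel scale, filled by the application
Weights shift{DataType::kFLOAT, shiftVals, 64};
Weights scale{DataType::kFLOAT, scaleVals, 64};
Weights power{DataType::kFLOAT, nullptr, 0};     // empty weights: power defaults to 1

// LRN parameters chosen inside the documented ranges: window 5, alpha 1e-4, beta 0.75, k 2.
ILRNLayer* lrn = network->addLRN(*data, 5, 1e-4f, 0.75f, 2.0f);
IScaleLayer* sc = network->addScale(*lrn->getOutput(0), ScaleMode::kCHANNEL, shift, scale, power);
```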
Let's say we have an NCHW tensor as input (three non-batch dimensions). + //! + //! In implicit mode : //! Bit 0 corresponds to the C dimension boolean. //! Bit 1 corresponds to the H dimension boolean. //! Bit 2 corresponds to the W dimension boolean. - //! For example, to perform softmax on axis R of a NPQRCHW input, set bit 2. - //! - //! By default, softmax is performed on the axis which is the number of non-batch axes minus three. It is 0 if there are fewer than 3 non-batch axes. - //! For example, if the input is NCHW, the default axis is C. If the input is NHW, then the default axis is H. + //! By default, softmax is performed on the axis which is the number of axes minus three. It is 0 if + //! there are fewer than 3 non-batch axes. For example, if the input is NCHW, the default axis is C. If the input + //! is NHW, then the default axis is H. + //! + //! In explicit mode : + //! Bit 0 corresponds to the N dimension boolean. + //! Bit 1 corresponds to the C dimension boolean. + //! Bit 2 corresponds to the H dimension boolean. + //! Bit 3 corresponds to the W dimension boolean. + //! By default, softmax is performed on the axis which is the number of axes minus three. It is 0 if + //! there are fewer than 3 axes. For example, if the input is NCHW, the default axis is C. If the input + //! is NHW, then the default axis is N. + //! + //! For example, to perform softmax on axis R of a NPQRCHW input, set bit 2 with implicit batch mode, + //! set bit 3 with explicit batch mode. //! //! \param axes The axis along which softmax is computed. + //! Here axes is a bitmap. For example, when doing softmax along axis 0, bit 0 is set to 1, axes = 1 << axis = 1. //! - virtual void setAxes(uint32_t axes) = 0; + virtual void setAxes(uint32_t axes) TRTNOEXCEPT = 0; //! //! \brief Get the axis along which softmax occurs. //! //! \see setAxes() //! - virtual uint32_t getAxes() const = 0; + virtual uint32_t getAxes() const TRTNOEXCEPT = 0; }; //! @@ -1776,19 +1832,19 @@ class IConcatenationLayer : public ILayer //! //! \brief Set the axis along which concatenation occurs. //! - //! 0 is the major axis (excluding the batch dimension). The default is the number of non-batch axes in the tensor minus three (e.g. - //! for an NCHW input it would be 0), or 0 if there are fewer than 3 non-batch axes. + //! 0 is the major axis (excluding the batch dimension). The default is the number of non-batch axes in the tensor + //! minus three (e.g. for an NCHW input it would be 0), or 0 if there are fewer than 3 non-batch axes. //! //! \param axis The axis along which concatenation occurs. //! - virtual void setAxis(int axis) = 0; + virtual void setAxis(int axis) TRTNOEXCEPT = 0; //! //! \brief Get the axis along which concatenation occurs. //! //! \see setAxis() //! - virtual int getAxis() const = 0; + virtual int getAxis() const TRTNOEXCEPT = 0; }; //! @@ -1810,14 +1866,14 @@ class IDeconvolutionLayer : public ILayer //! //! \see getKernelSize() //! - virtual void setKernelSize(DimsHW kernelSize) = 0; + virtual void setKernelSize(DimsHW kernelSize) TRTNOEXCEPT = 0; //! //! \brief Get the HW kernel size of the deconvolution. //! //! \see setKernelSize() //! - virtual DimsHW getKernelSize() const = 0; + virtual DimsHW getKernelSize() const TRTNOEXCEPT = 0; //! //! \brief Set the number of output feature maps for the deconvolution. @@ -1826,14 +1882,14 @@ class IDeconvolutionLayer : public ILayer //! //! \see getNbOutputMaps() //! 
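The axes bitmap convention for softmax is easy to get wrong; a sketch under the earlier assumptions, for an implicit-batch NCHW network:

```cpp
// Softmax along C: under implicit batch, bit 0 is the C axis (with explicit batch it would be bit 1).
ISoftMaxLayer* sm = network->addSoftMax(*data);
sm->setAxes(1u << 0);

// Concatenation along the same major axis.
ITensor* inputs[] = {sm->getOutput(0), data};
IConcatenationLayer* cat = network->addConcatenation(inputs, 2);
cat->setAxis(0);
```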
- virtual void setNbOutputMaps(int nbOutputMaps) = 0; + virtual void setNbOutputMaps(int nbOutputMaps) TRTNOEXCEPT = 0; //! //! \brief Get the number of output feature maps for the deconvolution. //! //! \see setNbOutputMaps() //! - virtual int getNbOutputMaps() const = 0; + virtual int getNbOutputMaps() const TRTNOEXCEPT = 0; //! //! \brief Get the stride of the deconvolution. @@ -1842,19 +1898,20 @@ class IDeconvolutionLayer : public ILayer //! //! \see setStride() //! - virtual void setStride(DimsHW stride) = 0; + virtual void setStride(DimsHW stride) TRTNOEXCEPT = 0; //! //! \brief Get the stride of the deconvolution. //! //! Default: (1,1) //! - virtual DimsHW getStride() const = 0; + virtual DimsHW getStride() const TRTNOEXCEPT = 0; //! //! \brief Set the padding of the deconvolution. //! - //! The output will be trimmed by this number of elements on each side in the height and width directions. In other words, it resembles the inverse of a convolution layer with this padding size. + //! The output will be trimmed by this number of elements on each side in the height and width directions. + //! In other words, it resembles the inverse of a convolution layer with this padding size. //! Padding is symmetric, and negative padding is not supported. //! //! Default: (0,0) @@ -1863,70 +1920,73 @@ class IDeconvolutionLayer : public ILayer //! //! \see getPadding() //! - virtual void setPadding(DimsHW padding) = 0; + virtual void setPadding(DimsHW padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding of the deconvolution. //! //! \see setPadding() //! - virtual DimsHW getPadding() const = 0; // padding defaults to 0 + virtual DimsHW getPadding() const TRTNOEXCEPT = 0; // padding defaults to 0 //! //! \brief Set the number of groups for a deconvolution. //! - //! The input tensor channels are divided into \p nbGroups groups, and a deconvolution is executed for each group, using a filter per group. The results of the group - //! convolutions are concatenated to form the output. + //! The input tensor channels are divided into \p nbGroups groups, and a deconvolution is executed for each group, + //! using a filter per group. The results of the group convolutions are concatenated to form the output. //! - //! \note When using groups in int8 mode, the size of the groups (i.e. the channel count divided by the group count) must be a multiple of 4 for both input and output. + //! \note When using groups in int8 mode, the size of the groups (i.e. the channel count divided by the group count) + //! must be a multiple of 4 for both input and output. //! //! Default: 1 //! //! \see getNbGroups() //! - virtual void setNbGroups(int nbGroups) = 0; + virtual void setNbGroups(int nbGroups) TRTNOEXCEPT = 0; //! //! \brief Get the number of groups for a deconvolution. //! //! \see setNbGroups() //! - virtual int getNbGroups() const = 0; + virtual int getNbGroups() const TRTNOEXCEPT = 0; //! //! \brief Set the kernel weights for the deconvolution. //! //! The weights are specified as a contiguous array in \p CKRS order, where \p C the number of - //! input channels, \p K the number of output feature maps, and \p R and \p S are the height and width of the filter. + //! input channels, \p K the number of output feature maps, and \p R and \p S are the height and width + //! of the filter. //! //! \see getWeights() //! - virtual void setKernelWeights(Weights weights) = 0; + virtual void setKernelWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the kernel weights for the deconvolution. //! //! 
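A sketch of the 2D deconvolution accessors above. The kernel follows the CKRS layout; the sizes assume 64 input channels, 64 output feature maps and a 3x3 filter, and the arrays are placeholders owned by the application.

```cpp
static float kernelVals[64 * 64 * 3 * 3];        // C * K * R * S for nbGroups == 1
static float biasVals[64];                       // one bias per output feature map
Weights kernel{DataType::kFLOAT, kernelVals, 64 * 64 * 3 * 3};
Weights bias{DataType::kFLOAT, biasVals, 64};

IDeconvolutionLayer* deconv = network->addDeconvolution(*data, 64, DimsHW{3, 3}, kernel, bias);
deconv->setStride(DimsHW{2, 2});
deconv->setPadding(DimsHW{1, 1});                // output trimmed by 1 on each side in H and W
deconv->setNbGroups(1);
```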
\see setNbGroups() //! - virtual Weights getKernelWeights() const = 0; + virtual Weights getKernelWeights() const TRTNOEXCEPT = 0; //! //! \brief Set the bias weights for the deconvolution. //! //! Bias is optional. To omit bias, set the count value of the weights structure to zero. //! - //! The bias is applied per-feature-map, so the number of weights (if non-zero) must be equal to the number of output feature maps. + //! The bias is applied per-feature-map, so the number of weights (if non-zero) must be equal to the number of + //! output feature maps. //! //! \see getBiasWeights() //! - virtual void setBiasWeights(Weights weights) = 0; + virtual void setBiasWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the bias weights for the deconvolution. //! //! \see getBiasWeights() //! - virtual Weights getBiasWeights() const = 0; + virtual Weights getBiasWeights() const TRTNOEXCEPT = 0; protected: virtual ~IDeconvolutionLayer() {} @@ -1943,14 +2003,14 @@ class IDeconvolutionLayer : public ILayer //! //! \see getPadding() //! - virtual void setPrePadding(Dims padding) = 0; + virtual void setPrePadding(Dims padding) TRTNOEXCEPT = 0; //! //! \brief Get the pre-padding. //! //! \see setPrePadding() //! - virtual Dims getPrePadding() const = 0; + virtual Dims getPrePadding() const TRTNOEXCEPT = 0; //! //! \brief Set the post-padding. @@ -1963,24 +2023,24 @@ class IDeconvolutionLayer : public ILayer //! //! \see getPadding() //! - virtual void setPostPadding(Dims padding) = 0; + virtual void setPostPadding(Dims padding) TRTNOEXCEPT = 0; //! //! \brief Get the padding. //! //! \see setPadding() //! - virtual Dims getPostPadding() const = 0; + virtual Dims getPostPadding() const TRTNOEXCEPT = 0; //! //! \brief Set the padding mode. //! - //! Padding mode gets precedence if both setPaddingMode and setPre/PostPadding are used. + //! Padding mode takes precedence if both setPaddingMode and setPre/PostPadding are used. //! //! Default: kEXPLICIT_ROUND_DOWN //! //! \see getPaddingMode() - virtual void setPaddingMode(PaddingMode paddingMode) = 0; + virtual void setPaddingMode(PaddingMode paddingMode) TRTNOEXCEPT = 0; //! //! \brief Set the padding mode. @@ -1988,7 +2048,64 @@ class IDeconvolutionLayer : public ILayer //! Default: kEXPLICIT_ROUND_DOWN //! //! \see setPaddingMode() - virtual PaddingMode getPaddingMode() const = 0; + virtual PaddingMode getPaddingMode() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension kernel size of the deconvolution. + //! + //! If executing this layer on DLA, only support 2D kernel size, both height and width of kernel size must be in the range [1,16]. + //! + //! \see getKernelSizeNd() setKernelSize() getKernelSize() + //! + virtual void setKernelSizeNd(Dims kernelSize) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension kernel size of the deconvolution. + //! + //! \see setKernelSizeNd() + //! + virtual Dims getKernelSizeNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension stride of the deconvolution. + //! + //! Default: (1, 1, ..., 1) + //! + //! If executing this layer on DLA, only support 2D stride, both height and width of stride must be in the range [1,8]. + //! + //! \see getStrideNd() setStride() getStride() + //! + virtual void setStrideNd(Dims stride) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension stride of the deconvolution. + //! + //! \see setStrideNd() + //! + virtual Dims getStrideNd() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the multi-dimension padding of the deconvolution. 
+ //! + //! The input will be zero-padded by this number of elements in each dimension. + //! Padding is symmetric. + //! + //! Default: (0, 0, ..., 0) + //! + //! If executing this layer on DLA, only support 2D padding, both height and width of padding must be in the range [0,15]. + //! + //! \see getPaddingNd() setPadding() getPadding() + //! + virtual void setPaddingNd(Dims padding) TRTNOEXCEPT = 0; + + //! + //! \brief Get the multi-dimension padding of the deconvolution. + //! + //! If the padding is asymmetric, the pre-padding is returned. + //! + //! \see setPaddingNd() + //! + virtual Dims getPaddingNd() const TRTNOEXCEPT = 0; }; //! @@ -2000,19 +2117,20 @@ class IDeconvolutionLayer : public ILayer //! enum class ElementWiseOperation : int { - kSUM = 0, //!< Sum of the two elements. - kPROD = 1, //!< Product of the two elements. - kMAX = 2, //!< Maximum of the two elements. - kMIN = 3, //!< Minimum of the two elements. - kSUB = 4, //!< Substract the second element from the first. - kDIV = 5, //!< Divide the first element by the second. - kPOW = 6 //!< The first element to the power of the second element. + kSUM = 0, //!< Sum of the two elements. + kPROD = 1, //!< Product of the two elements. + kMAX = 2, //!< Maximum of the two elements. + kMIN = 3, //!< Minimum of the two elements. + kSUB = 4, //!< Substract the second element from the first. + kDIV = 5, //!< Divide the first element by the second. + kPOW = 6, //!< The first element to the power of the second element. + kFLOOR_DIV = 7 //!< Floor division of the first element by the second. }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { - return 7; + return 8; } //!< Maximum number of elements in ElementWiseOperation enum. \see ElementWiseOperation //! @@ -2038,7 +2156,7 @@ class IElementWiseLayer : public ILayer //! //! \see getBiasWeights() //! - virtual void setOperation(ElementWiseOperation type) = 0; + virtual void setOperation(ElementWiseOperation type) TRTNOEXCEPT = 0; //! //! \brief Get the binary operation for the layer. @@ -2047,7 +2165,7 @@ class IElementWiseLayer : public ILayer //! //! \see setBiasWeights() //! - virtual ElementWiseOperation getOperation() const = 0; + virtual ElementWiseOperation getOperation() const TRTNOEXCEPT = 0; protected: virtual ~IElementWiseLayer() {} @@ -2060,19 +2178,34 @@ class IGatherLayer : public ILayer { public: //! - //! \brief Set the non-batch dimension axis to gather on. - //! The axis must be less than the number of non-batch dimensions in the data input. + //! \brief Set the axis to gather on. + //! The axis must be less than the number of dimensions in the data input. //! //! \see getGatherAxis() //! - virtual void setGatherAxis(int axis) = 0; + virtual void setGatherAxis(int axis) TRTNOEXCEPT = 0; //! - //! \brief Get the non-batch dimension axis to gather on. + //! \brief Get the axis to gather on. //! //! \see setGatherAxis() //! - virtual int getGatherAxis() const = 0; + virtual int getGatherAxis() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the number of leading dimensions of indices tensor to be handled elementwise. + //! k must be 0 if there is an implicit batch dimension. It can be 0 or 1 if there is not an implicit batch dimension. + //! + //! \see getNbElementWiseDims() + //! + virtual void setNbElementWiseDims(int k) TRTNOEXCEPT = 0; + + //! + //! \brief Get the number of leading dimensions of indices tensor to be handled elementwise. + //! + //! \see setNbElementWiseDims() + //! 
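The new kFLOOR_DIV operation and the gather accessors can be wired as follows. This sketch assumes two existing tensors `lhs` and `rhs` of matching dimensions and an Int32 `indices` tensor, on top of the earlier assumptions.

```cpp
// Elementwise floor division, new in this release.
IElementWiseLayer* div = network->addElementWise(*lhs, *rhs, ElementWiseOperation::kFLOOR_DIV);

// Gather along axis 1 of the result; nbElementWiseDims must be 0 under an implicit batch dimension.
IGatherLayer* gather = network->addGather(*div->getOutput(0), *indices, 1);
gather->setNbElementWiseDims(0);
```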
+ virtual int getNbElementWiseDims() const TRTNOEXCEPT = 0; protected: virtual ~IGatherLayer() {} @@ -2166,7 +2299,7 @@ enum class RNNOperation : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 4; } //!< Maximum number of elements in RNNOperation enum. \see RNNOperation @@ -2185,7 +2318,7 @@ enum class RNNDirection : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 2; } //!< Maximum number of elements in RNNDirection enum. \see RNNDirection @@ -2212,7 +2345,7 @@ enum class RNNInputMode : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 2; } //!< Maximum number of elements in RNNInputMode enum. \see RNNInputMode @@ -2228,7 +2361,7 @@ inline int EnumMax() //! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! -class IRNNLayer : public ILayer +class TRT_DEPRECATED IRNNLayer : public ILayer { public: //! @@ -2236,7 +2369,7 @@ class IRNNLayer : public ILayer //! //! \return The number of layers in the RNN. //! - virtual unsigned getLayerCount() const = 0; + virtual unsigned getLayerCount() const TRTNOEXCEPT = 0; //! //! \brief Get the size of the hidden layers. @@ -2246,7 +2379,7 @@ class IRNNLayer : public ILayer //! \return The internal hidden layer size for the RNN. //! \see getDirection(), addRNN() //! - virtual std::size_t getHiddenSize() const = 0; + virtual std::size_t getHiddenSize() const TRTNOEXCEPT = 0; //! //! \brief Get the sequence length. @@ -2256,35 +2389,35 @@ class IRNNLayer : public ILayer //! //! \return the maximum number of time steps that can be executed by a single call RNN layer. //! - virtual int getSeqLength() const = 0; + virtual int getSeqLength() const TRTNOEXCEPT = 0; //! //! \brief Set the operation of the RNN layer. //! //! \see getOperation(), RNNOperation //! - virtual void setOperation(RNNOperation op) = 0; + virtual void setOperation(RNNOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the operation of the RNN layer. //! //! \see setOperation(), RNNOperation //! - virtual RNNOperation getOperation() const = 0; + virtual RNNOperation getOperation() const TRTNOEXCEPT = 0; //! //! \brief Set the operation of the RNN layer. //! //! \see getInputMode(), RNNInputMode //! - virtual void setInputMode(RNNInputMode op) = 0; + virtual void setInputMode(RNNInputMode op) TRTNOEXCEPT = 0; //! //! \brief Get the operation of the RNN layer. //! //! \see setInputMode(), RNNInputMode //! - virtual RNNInputMode getInputMode() const = 0; + virtual RNNInputMode getInputMode() const TRTNOEXCEPT = 0; //! //! \brief Set the direction of the RNN layer. @@ -2297,14 +2430,14 @@ class IRNNLayer : public ILayer //! in output size of 2x getHiddenSize(). //! \see getDirection(), RNNDirection //! - virtual void setDirection(RNNDirection op) = 0; + virtual void setDirection(RNNDirection op) TRTNOEXCEPT = 0; //! //! \brief Get the direction of the RNN layer. //! //! \see setDirection(), RNNDirection //! - virtual RNNDirection getDirection() const = 0; + virtual RNNDirection getDirection() const TRTNOEXCEPT = 0; //! //! \param weights The weight structure holding the weight parameters. @@ -2420,14 +2553,14 @@ class IRNNLayer : public ILayer //! //! \see getWeights(), #RNNOperation //! - virtual void setWeights(Weights weights) = 0; + virtual void setWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the W weights for the RNN. //! //! \see setWeights() //! 
- virtual Weights getWeights() const = 0; + virtual Weights getWeights() const TRTNOEXCEPT = 0; //! //! \param bias The weight structure holding the bias parameters. @@ -2480,14 +2613,14 @@ class IRNNLayer : public ILayer //! //! \see getBias(), #RNNOperation //! - virtual void setBias(Weights bias) = 0; + virtual void setBias(Weights bias) TRTNOEXCEPT = 0; //! //! \brief Get the bias parameter vector for the RNN. //! //! \see setBias() //! - virtual Weights getBias() const = 0; + virtual Weights getBias() const TRTNOEXCEPT = 0; //! //! \brief Get the length of the data being processed by the RNN for use in computing @@ -2495,7 +2628,7 @@ class IRNNLayer : public ILayer //! //! \see setHiddenState(), setCellState() //! - virtual int getDataLength() const = 0; + virtual int getDataLength() const TRTNOEXCEPT = 0; //! //! \param hidden The initial hidden state of the RNN. @@ -2507,20 +2640,21 @@ class IRNNLayer : public ILayer //! - H - The number of mini-batches for each time sequence. //! - W - The size of the per layer hidden states, it must match getHiddenSize(). //! - //! If getDirection() is ::kBIDIRECTION, the amount of space required is doubled and C is equal to getLayerCount() * 2. + //! If getDirection() is ::kBIDIRECTION, the amount of space required is doubled and C is equal to + //! getLayerCount() * 2. //! //! If hidden is not specified, then the initial hidden state is set to zero. //! //! \see getHiddenState() //! - virtual void setHiddenState(ITensor& hidden) = 0; + virtual void setHiddenState(ITensor& hidden) TRTNOEXCEPT = 0; //! //! \brief Get the initial hidden state of the RNN. //! //! \return nullptr if no initial hidden tensor was specified, the initial hidden data otherwise. //! - virtual ITensor* getHiddenState() const = 0; + virtual ITensor* getHiddenState() const TRTNOEXCEPT = 0; //! //! \param cell The initial cell state of the RNN. @@ -2534,20 +2668,21 @@ class IRNNLayer : public ILayer //! //! If \p cell is not specified, then the initial cell state is set to zero. //! - //! If getDirection() is ::kBIDIRECTION, the amount of space required is doubled and C is equal to getLayerCount() * 2. + //! If getDirection() is ::kBIDIRECTION, the amount of space required is doubled and C is equal to + //! getLayerCount() * 2. //! //! The cell state only affects LSTM RNN's. //! //! \see getCellState() //! - virtual void setCellState(ITensor& cell) = 0; + virtual void setCellState(ITensor& cell) TRTNOEXCEPT = 0; //! //! \brief Get the initial cell state of the RNN. //! //! \return nullptr if no initial cell tensor was specified, the initial cell data otherwise. //! - virtual ITensor* getCellState() const = 0; + virtual ITensor* getCellState() const TRTNOEXCEPT = 0; protected: virtual ~IRNNLayer() {} @@ -2572,7 +2707,7 @@ enum class RNNGateType : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 7; } //!< Maximum number of elements in RNNGateType enum. 
\see RNNGateType @@ -2589,10 +2724,10 @@ inline int EnumMax() class IRNNv2Layer : public ILayer { public: - virtual int32_t getLayerCount() const = 0; //< Get the layer count of the RNN - virtual int32_t getHiddenSize() const = 0; //< Get the hidden size of the RNN - virtual int32_t getMaxSeqLength() const = 0; //< Get the maximum sequence length of the RNN - virtual int32_t getDataLength() const = 0; //< Get the maximum data length of the RNN + virtual int32_t getLayerCount() const TRTNOEXCEPT = 0; //< Get the layer count of the RNN + virtual int32_t getHiddenSize() const TRTNOEXCEPT = 0; //< Get the hidden size of the RNN + virtual int32_t getMaxSeqLength() const TRTNOEXCEPT = 0; //< Get the maximum sequence length of the RNN + virtual int32_t getDataLength() const TRTNOEXCEPT = 0; //< Get the maximum data length of the RNN //! //! \brief Specify individual sequence lengths in the batch with the ITensor pointed to by @@ -2608,7 +2743,7 @@ class IRNNv2Layer : public ILayer //! //! This tensor must be of type DataType::kINT32. //! - virtual void setSequenceLengths(ITensor& seqLengths) = 0; + virtual void setSequenceLengths(ITensor& seqLengths) TRTNOEXCEPT = 0; //! //! \brief Get the sequence lengths specified for the RNN. @@ -2617,43 +2752,43 @@ class IRNNv2Layer : public ILayer //! //! \see setSequenceLengths() //! - virtual ITensor* getSequenceLengths() const = 0; + virtual ITensor* getSequenceLengths() const TRTNOEXCEPT = 0; //! //! \brief Set the operation of the RNN layer. //! \see getOperation(), RNNOperation //! - virtual void setOperation(RNNOperation op) = 0; + virtual void setOperation(RNNOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the operation of the RNN layer. //! \see setOperation(), RNNOperation //! - virtual RNNOperation getOperation() const = 0; + virtual RNNOperation getOperation() const TRTNOEXCEPT = 0; //! //! \brief Set the input mode of the RNN layer. //! \see getInputMode(), RNNInputMode //! - virtual void setInputMode(RNNInputMode op) = 0; + virtual void setInputMode(RNNInputMode op) TRTNOEXCEPT = 0; //! //! \brief Get the input mode of the RNN layer. //! \see setInputMode(), RNNInputMode //! - virtual RNNInputMode getInputMode() const = 0; + virtual RNNInputMode getInputMode() const TRTNOEXCEPT = 0; //! //! \brief Set the direction of the RNN layer. //! \see getDirection(), RNNDirection //! - virtual void setDirection(RNNDirection op) = 0; + virtual void setDirection(RNNDirection op) TRTNOEXCEPT = 0; //! //! \brief Get the direction of the RNN layer. //! \see setDirection(), RNNDirection //! - virtual RNNDirection getDirection() const = 0; + virtual RNNDirection getDirection() const TRTNOEXCEPT = 0; //! //! \brief Set the weight parameters for an individual gate in the RNN. @@ -2672,13 +2807,13 @@ class IRNNv2Layer : public ILayer //! in IRNNLayer::setWeights() for documentation on the expected //! dimensions of this matrix. //! - virtual void setWeightsForGate(int layerIndex, RNNGateType gate, bool isW, Weights weights) = 0; + virtual void setWeightsForGate(int layerIndex, RNNGateType gate, bool isW, Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the weight parameters for an individual gate in the RNN. //! \see setWeightsForGate() //! - virtual Weights getWeightsForGate(int layerIndex, RNNGateType gate, bool isW) const = 0; + virtual Weights getWeightsForGate(int layerIndex, RNNGateType gate, bool isW) const TRTNOEXCEPT = 0; //! //! \brief Set the bias parameters for an individual gate in the RNN. 
@@ -2695,13 +2830,13 @@ class IRNNv2Layer : public ILayer //! \param bias The weight structure holding the bias parameters, which should be an //! array of size getHiddenSize(). //! - virtual void setBiasForGate(int layerIndex, RNNGateType gate, bool isW, Weights bias) = 0; + virtual void setBiasForGate(int layerIndex, RNNGateType gate, bool isW, Weights bias) TRTNOEXCEPT = 0; //! //! \brief Get the bias parameters for an individual gate in the RNN. //! \see setBiasForGate() //! - virtual Weights getBiasForGate(int layerIndex, RNNGateType gate, bool isW) const = 0; + virtual Weights getBiasForGate(int layerIndex, RNNGateType gate, bool isW) const TRTNOEXCEPT = 0; //! //! \brief Set the initial hidden state of the RNN with the provided \p hidden ITensor. @@ -2715,13 +2850,13 @@ class IRNNv2Layer : public ILayer //! final backward hidden state is stored in `L= 2*l + 1`. //! - `H` is the hidden state for each layer, equal to getHiddenSize(). //! - virtual void setHiddenState(ITensor& hidden) = 0; + virtual void setHiddenState(ITensor& hidden) TRTNOEXCEPT = 0; //! //! \brief Get the initial hidden state of the RNN. //! \see setHiddenState() //! - virtual ITensor* getHiddenState() const = 0; + virtual ITensor* getHiddenState() const TRTNOEXCEPT = 0; //! //! \brief Set the initial cell state of the LSTM with the provided \p cell ITensor. @@ -2737,13 +2872,13 @@ class IRNNv2Layer : public ILayer //! //! It is an error to call setCellState() on an RNN layer that is not configured with RNNOperation::kLSTM. //! - virtual void setCellState(ITensor& cell) = 0; + virtual void setCellState(ITensor& cell) TRTNOEXCEPT = 0; //! //! \brief Get the initial cell state of the RNN. //! \see setCellState() //! - virtual ITensor* getCellState() const = 0; + virtual ITensor* getCellState() const TRTNOEXCEPT = 0; protected: virtual ~IRNNv2Layer() {} @@ -2758,7 +2893,8 @@ class IOutputDimensionsFormula { public: //! - //! \brief Application-implemented interface to compute the HW output dimensions of a layer from the layer input and parameters. + //! \brief Application-implemented interface to compute the HW output dimensions of a layer from the layer input + //! and parameters. //! //! \param inputDims The input dimensions of the layer. //! \param kernelSize The kernel size (or window size, for a pooling layer) parameter of the layer operation. @@ -2771,504 +2907,23 @@ class IOutputDimensionsFormula //! //! Note that for dilated convolutions, the dilation is applied to the kernel size before this routine is called. //! - virtual DimsHW compute(DimsHW inputDims, DimsHW kernelSize, DimsHW stride, DimsHW padding, DimsHW dilation, const char* layerName) const = 0; + virtual DimsHW compute(DimsHW inputDims, DimsHW kernelSize, DimsHW stride, DimsHW padding, DimsHW dilation, const char* layerName) const TRTNOEXCEPT = 0; virtual ~IOutputDimensionsFormula() {} }; //! -//! \enum PluginFormatType -//! -//! \brief Format of the input/output tensors. -//! -//! \see IPluginExt::getPluginFormats() +//! \class IPluginLayer //! -//! For more information about data formats, see the topic "Data Format Description" located in the -//! TensorRT Developer Guide (https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html). +//! \brief Layer type for plugins. //! -enum class PluginFormat : uint8_t -{ - //! NCHW. - kNCHW = 0, - - //! NCHW with 2-element packed channels. For a tensor with dimensions {N, C, H, W}, - //! the memory layout is equivalent to a C array with dimensions [N][(C+1)/2][H][W][2], - //! 
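For IRNNv2Layer, the per-gate weight and bias setters above are typically driven in a loop over layers and gates. The sketch below shows a single (layer, gate) pair for a unidirectional LSTM; `seqLen` is an existing Int32 tensor of per-example sequence lengths, and `inputGateWeights`/`inputGateBias` are placeholder Weights whose shapes must follow the rules documented for setWeightsForGate() and setBiasForGate().

```cpp
const int layerCount = 1, hiddenSize = 256, maxSeqLen = 100;
IRNNv2Layer* rnn = network->addRNNv2(*data, layerCount, hiddenSize, maxSeqLen, RNNOperation::kLSTM);
rnn->setDirection(RNNDirection::kUNIDIRECTION);
rnn->setInputMode(RNNInputMode::kLINEAR);
rnn->setSequenceLengths(*seqLen);

// One gate of layer 0 shown; repeat for kFORGET, kCELL, kOUTPUT and for the
// recurrent (isW == false) side of each gate.
rnn->setWeightsForGate(0, RNNGateType::kINPUT, /*isW*/ true, inputGateWeights);
rnn->setBiasForGate(0, RNNGateType::kINPUT, /*isW*/ true, inputGateBias);
```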
with the tensor coordinates (n,c,h,w) mapping to array subscript [n][c/2][h][w][c%2]. - kNC2HW2 = 1, - - //! NHWC where C must be a multiple of 8. - kNHWC8 = 2 -}; - -template <> -inline int EnumMax() -{ - return 3; -} //!< Maximum number of elements in PluginFormat enum. \see PluginFormat - -//! \class IPlugin +//! \see IPluginExt //! -//! \brief Plugin class for user-implemented layers. +//! \deprecated This interface is superseded by IPluginV2Layer //! -//! Plugins are a mechanism for applications to implement custom layers. Each plugin is owned by the application, and its lifetime -//! must span any use of it by TensorRT +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! -class IPlugin -{ -public: - //! - //! \brief Get the number of outputs from the layer. - //! - //! \return The number of outputs. - //! - //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). - //! - virtual int getNbOutputs() const = 0; - - //! - //! \brief Get the dimension of an output tensor. - //! - //! \param index The index of the output tensor. - //! \param inputs The input tensors. - //! \param nbInputDims The number of input tensors. - //! - //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). - //! - virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) = 0; - - //! - //! \brief Configure the layer. - //! - //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis - //! of its weights, dimensions, and maximum batch size. The type is assumed to be FP32 and format NCHW. - //! - //! \param inputDims The input tensor dimensions. - //! \param nbInputs The number of inputs. - //! \param outputDims The output tensor dimensions. - //! \param nbOutputs The number of outputs. - //! \param maxBatchSize The maximum batch size. - //! - //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). - //! - //! This method is not called for PluginExt classes; configureWithFormat is called instead. - //! - virtual void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) = 0; - - //! - //! \brief Initialize the layer for execution. This is called when the engine is created. - //! - //! \return 0 for success, else non-zero (which will cause engine termination). - //! - virtual int initialize() = 0; - - //! - //! \brief Release resources acquired during plugin layer initialization. This is called when the engine is destroyed. - //! \see initialize() - //! - virtual void terminate() = 0; - - //! - //! \brief Find the workspace size required by the layer. - //! - //! This function is called during engine startup, after initialize(). The workspace size returned should be sufficient for any - //! batch size up to the maximum. - //! - //! \return The workspace size. - //! - virtual size_t getWorkspaceSize(int maxBatchSize) const = 0; - - //! - //! \brief Execute the layer. - //! - //! \param batchSize The number of inputs in the batch. - //! \param inputs The memory for the input tensors. - //! \param outputs The memory for the output tensors. - //! \param workspace Workspace for execution. - //! 
\param stream The stream in which to execute the kernels. - //! - //! \return 0 for success, else non-zero (which will cause engine termination). - //! - virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) = 0; - - //! - //! \brief Find the size of the serialization buffer required. - //! - //! \return The size of the serialization buffer. - //! - virtual size_t getSerializationSize() = 0; - - //! - //! \brief Serialize the layer. - //! - //! \param buffer A pointer to a buffer of size at least that returned by getSerializationSize(). - //! - //! \see getSerializationSize() - //! - virtual void serialize(void* buffer) = 0; - - virtual ~IPlugin() {} -}; - -//! -//! \class IPluginExt -//! -//! \brief Plugin class for user-implemented layers. -//! -//! Plugins are a mechanism for applications to implement custom layers. Each plugin is owned by the application, and its lifetime -//! must span any use of it by TensorRT. -//! -class IPluginExt : public IPlugin -{ -public: - //! - //! \brief Return the API version with which this plugin was built. - //! - //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with plugins. - //! - virtual int getTensorRTVersion() const - { - return NV_TENSORRT_VERSION; - } - - //! - //! \brief Check format support. - //! - //! \param type DataType requested. - //! \param format PluginFormat requested. - //! \return true if the plugin supports the type-format combination. - //! - //! This function is called by the implementations of INetworkDefinition, IBuilder, and ICudaEngine. - //! In particular, it is called when creating an engine and when deserializing an engine. - //! - virtual bool supportsFormat(DataType type, PluginFormat format) const = 0; - - //! - //! \brief Configure the layer. - //! - //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis - //! of its weights, dimensions, and maximum batch size. - //! - //! \param inputDims The input tensor dimensions. - //! \param nbInputs The number of inputs. - //! \param outputDims The output tensor dimensions. - //! \param nbOutputs The number of outputs. - //! \param type The data type selected for the engine. - //! \param format The format selected for the engine. - //! \param maxBatchSize The maximum batch size. - //! - //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). - //! - virtual void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) = 0; - - virtual ~IPluginExt() {} - -protected: - //! - //! \brief Derived classes should not implement this. In a C++11 API it would be override final. - //! - void configure(const Dims* /*inputDims*/, int /*nbInputs*/, const Dims* /*outputDims*/, int /*nbOutputs*/, int /*maxBatchSize*/) _TENSORRT_FINAL {} -}; - -//! \class IPluginV2 -//! -//! \brief Plugin class for user-implemented layers. -//! -//! Plugins are a mechanism for applications to implement custom layers. When -//! combined with IPluginCreator it provides a mechanism to register plugins and -//! look up the Plugin Registry during de-serialization. -//! -//! \see IPluginCreator -//! \see IPluginRegistry -//! -class IPluginV2 -{ -public: - //! - //! \brief Return the API version with which this plugin was built. - //! 
- //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with plugins. - //! - virtual int getTensorRTVersion() const - { - return NV_TENSORRT_VERSION; - } - - //! - //! \brief Return the plugin type. Should match the plugin name returned by the corresponding plugin creator - // \see IPluginCreator::getPluginName() - //! - virtual const char* getPluginType() const = 0; - - //! - //! \brief Return the plugin version. Should match the plugin version returned by the corresponding plugin creator - // \see IPluginCreator::getPluginVersion() - //! - virtual const char* getPluginVersion() const = 0; - - //! - //! \brief Get the number of outputs from the layer. - //! - //! \return The number of outputs. - //! - //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). - //! - virtual int getNbOutputs() const = 0; - - //! - //! \brief Get the dimension of an output tensor. - //! - //! \param index The index of the output tensor. - //! \param inputs The input tensors. - //! \param nbInputDims The number of input tensors. - //! - //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). - //! - virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) = 0; - - //! - //! \brief Check format support. - //! - //! \param type DataType requested. - //! \param format PluginFormat requested. - //! \return true if the plugin supports the type-format combination. - //! - //! This function is called by the implementations of INetworkDefinition, IBuilder, and ICudaEngine. - //! In particular, it is called when creating an engine and when deserializing an engine. - //! - virtual bool supportsFormat(DataType type, PluginFormat format) const = 0; - - //! - //! \brief Configure the layer. - //! - //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis - //! of its weights, dimensions, and maximum batch size. - //! - //! \param inputDims The input tensor dimensions. - //! \param nbInputs The number of inputs. - //! \param outputDims The output tensor dimensions. - //! \param nbOutputs The number of outputs. - //! \param type The data type selected for the engine. - //! \param format The format selected for the engine. - //! \param maxBatchSize The maximum batch size. - //! - //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). - //! - virtual void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) = 0; - - //! - //! \brief Initialize the layer for execution. This is called when the engine is created. - //! - //! \return 0 for success, else non-zero (which will cause engine termination). - //! - virtual int initialize() = 0; - - //! - //! \brief Release resources acquired during plugin layer initialization. This is called when the engine is destroyed. - //! \see initialize() - //! - virtual void terminate() = 0; - - //! - //! \brief Find the workspace size required by the layer. - //! - //! This function is called during engine startup, after initialize(). The workspace size returned should be sufficient for any - //! batch size up to the maximum. - //! - //! \return The workspace size. 
- //! - virtual size_t getWorkspaceSize(int maxBatchSize) const = 0; - - //! - //! \brief Execute the layer. - //! - //! \param batchSize The number of inputs in the batch. - //! \param inputs The memory for the input tensors. - //! \param outputs The memory for the output tensors. - //! \param workspace Workspace for execution. - //! \param stream The stream in which to execute the kernels. - //! - //! \return 0 for success, else non-zero (which will cause engine termination). - //! - virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) = 0; - - //! - //! \brief Find the size of the serialization buffer required. - //! - //! \return The size of the serialization buffer. - //! - virtual size_t getSerializationSize() const = 0; - - //! - //! \brief Serialize the layer. - //! - //! \param buffer A pointer to a buffer to serialize data. Size of buffer must be equal to value returned by getSerializationSize. - //! - //! \see getSerializationSize() - //! - virtual void serialize(void* buffer) const = 0; - - //! - //! \brief Destroy the plugin object. This will be called when the network, builder or engine is destroyed. - //! - virtual void destroy() = 0; - - //! - //! \brief Clone the plugin object. This copies over internal plugin parameters and returns a new plugin object with these parameters. - //! - virtual IPluginV2* clone() const = 0; - - //! - //! \brief Set the namespace that this plugin object belongs to. Ideally, all plugin - //! objects from the same plugin library should have the same namespace. - //! - virtual void setPluginNamespace(const char* pluginNamespace) = 0; - - //! - //! \brief Return the namespace of the plugin object. - //! - virtual const char* getPluginNamespace() const = 0; - -protected: - virtual ~IPluginV2() {} -}; - -class IGpuAllocator; - -//! \class IPluginV2Ext -//! -//! \brief Plugin class for user-implemented layers. -//! -//! Plugins are a mechanism for applications to implement custom layers. This -//! interface provides additional capabilities to the IPluginV2 interface by -//! supporting different output data types and broadcast across batch. -//! -//! \see IPluginV2 -//! -class IPluginV2Ext : public IPluginV2 -{ -public: - //! - //! \brief Return the DataType of the plugin output at the requested index. - //! The default behavior should be to return the type of the first input, or DataType::kFLOAT if the layer has no inputs. - //! The returned data type must have a format that is supported by the plugin. - //! \see supportsFormat() - //! - virtual nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const = 0; - - //! \brief Return true if output tensor is broadcast across a batch. - //! - //! \param outputIndex The index of the output - //! \param inputIsBroadcasted The ith element is true if the tensor for the ith input is broadcast across a batch. - //! \param nbInputs The number of inputs - //! - //! The values in inputIsBroadcasted refer to broadcasting at the semantic level, - //! i.e. are unaffected by whether method canBroadcastInputAcrossBatch requests - //! physical replication of the values. - //! - virtual bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const = 0; - - //! \brief Return true if plugin can use input that is broadcast across batch without replication. - //! - //! \param inputIndex Index of input that could be broadcast. - //! - //! 
For each input whose tensor is semantically broadcast across a batch, - //! TensorRT calls this method before calling configurePlugin. - //! If canBroadcastInputAcrossBatch returns true, TensorRT will not replicate the input tensor; - //! i.e., there will be a single copy that the plugin should share across the batch. - //! If it returns false, TensorRT will replicate the input tensor - //! so that it appears like a non-broadcasted tensor. - //! - //! This method is called only for inputs that can be broadcast. - //! - virtual bool canBroadcastInputAcrossBatch(int inputIndex) const = 0; - - //! - //! \brief Configure the layer with input and output data types. - //! - //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis - //! of its weights, dimensions, data types and maximum batch size. - //! - //! \param inputDims The input tensor dimensions. - //! \param nbInputs The number of inputs. - //! \param outputDims The output tensor dimensions. - //! \param nbOutputs The number of outputs. - //! \param inputTypes The data types selected for the plugin inputs. - //! \param outputTypes The data types selected for the plugin outputs. - //! \param inputIsBroadcast True for each input that the plugin must broadcast across the batch. - //! \param outputIsBroadcast True for each output that TensorRT will broadcast across the batch. - //! \param floatFormat The format selected for the engine for the floating point - //! inputs/outputs. - //! \param maxBatchSize The maximum batch size. - //! - //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). - //! When inputIsBroadcast or outputIsBroadcast is true, the outermost batch size for that input or output should be treated as if it is one. - //! \ref inputIsBroadcast[i] is true only if the input is semantically broadcast across the batch and \ref canBroadcastInputAcrossBatch(i) returned true. - //! \ref outputIsBroadcast[i] is true only if \ref isOutputBroadcastAcrossBatch(i) returned true. - - virtual void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, - int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, - const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) - = 0; - - virtual ~IPluginV2Ext() {} - - //! - //! \brief Attach the plugin object to an execution context and grant the plugin the access to some context resource. - //! - //! \param cudnn The cudnn context handle of the execution context - //! \param cublas The cublas context handle of the execution context - //! \param allocator The allocator used by the execution context - //! - //! This function is called automatically for each plugin when a new execution context is created. - //! If the plugin needs per-context resource, it can be allocated here. - //! The plugin can also get context-owned CUDNN and CUBLAS context here. - //! - virtual void attachToContext(cudnnContext* /*cudnn*/, cublasContext* /*cublas*/, IGpuAllocator* /*allocator*/) {} - - //! - //! \brief Detach the plugin object from its execution context. - //! - //! This function is called automatically for each plugin when a execution context is destroyed. - //! If the plugin owns per-context resource, it can be released here. - //! - virtual void detachFromContext() {} - - //! - //! \brief Clone the plugin object. 
This copies over internal plugin parameters as well and returns a new plugin object with these parameters. - //! If the source plugin is pre-configured with configurePlugin(), the returned object should also be pre-configured. The returned object should allow attachToContext() with a new execution context. - //! Cloned plugin objects can share the same per-engine immutable resource (e.g. weights) with the source object (e.g. via ref-counting) to avoid duplication. - //! - virtual IPluginV2Ext* clone() const _TENSORRT_OVERRIDE = 0; - -protected: - //! - //! \brief Return the API version with which this plugin was built. The - //! upper byte reserved by TensorRT and is used to differentiate this from IPlguinV2. - //! - //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with plugins. - //! - int getTensorRTVersion() const _TENSORRT_OVERRIDE - { - return (0x01000000 | (NV_TENSORRT_VERSION & 0xFFFFFF)); - } - - //! - //! \brief Derived classes should not implement this. In a C++11 API it would be override final. - //! - void configureWithFormat(const Dims* /*inputDims*/, int /*nbInputs*/, const Dims* /*outputDims*/, - int /*nbOutputs*/, DataType /*type*/, PluginFormat /*format*/, int /*maxBatchSize*/) _TENSORRT_OVERRIDE _TENSORRT_FINAL {} -}; - -//! -//! \class IPluginLayer -//! -//! \brief Layer type for plugins. -//! -//! \see IPluginExt -//! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. -//! -class IPluginLayer : public ILayer +class TRT_DEPRECATED IPluginLayer : public ILayer { public: //! @@ -3276,7 +2931,7 @@ class IPluginLayer : public ILayer //! //! \see IPluginExt //! - virtual IPlugin& getPlugin() = 0; + virtual IPlugin& getPlugin() TRTNOEXCEPT = 0; protected: virtual ~IPluginLayer() {} @@ -3299,170 +2954,12 @@ class IPluginV2Layer : public ILayer //! //! \see IPluginV2 //! - virtual IPluginV2& getPlugin() = 0; + virtual IPluginV2& getPlugin() TRTNOEXCEPT = 0; protected: virtual ~IPluginV2Layer() {} }; -//! -//! \enum FieldType -//! \brief The possible field types for custom layer. -//! - -enum class PluginFieldType : int -{ - kFLOAT16 = 0, //!< FP16 field type. - kFLOAT32 = 1, //!< FP32 field type. - kFLOAT64 = 2, //!< FP64 field type. - kINT8 = 3, //!< INT8 field type. - kINT16 = 4, //!< INT16 field type. - kINT32 = 5, //!< INT32 field type. - kCHAR = 6, //!< char field type. - kDIMS = 7, //!< nvinfer1::Dims field type. - kUNKNOWN = 8 -}; - -//! -//! \class PluginField -//! -//! \brief Structure containing plugin attribute field names and associated data -//! This information can be parsed to decode necessary plugin metadata -//! -//! -struct PluginField -{ - //! - //! \brief Plugin field attribute name - //! - const char* name; - //! - //! \brief Plugin field attribute data - //! - const void* data; - //! - //! \brief Plugin field attribute type - //! \see PluginFieldType - //! - PluginFieldType type; - //! - //! \brief Number of data entries in the Plugin attribute - //! - int length; - - PluginField(const char* name_ = nullptr, const void* data_ = nullptr, const PluginFieldType type_ = PluginFieldType::kUNKNOWN, int length_ = 0) - : name(name_) - , data(data_) - , type(type_) - , length(length_) - { - } -}; - -struct PluginFieldCollection -{ - int nbFields; //!< Number of PluginField entries - const PluginField* fields; //!< Pointer to PluginField entries -}; - -//! -//! \class IPluginCreator -//! -//! \brief Plugin creator class for user implemented layers. 
-//! -//! \see IPlugin and IPluginFactory -//! - -class IPluginCreator -{ -public: - //! - //! \brief Return the version of the API the plugin creator was compiled with. - //! - virtual int getTensorRTVersion() const { return NV_TENSORRT_VERSION; } - - //! - //! \brief Return the plugin name. - //! - virtual const char* getPluginName() const = 0; - - //! - //! \brief Return the plugin version. - //! - virtual const char* getPluginVersion() const = 0; - - //! - //! \brief Return a list of fields that needs to be passed to createPlugin. - //! \see PluginFieldCollection - //! - virtual const PluginFieldCollection* getFieldNames() = 0; - - //! - //! \brief Return a plugin object. Return nullptr in case of error. - //! - virtual IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) = 0; - - //! - //! \brief Called during deserialization of plugin layer. Return a plugin object. - //! - virtual IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) = 0; - - //! - //! \brief Set the namespace of the plugin creator based on the plugin - //! library it belongs to. This can be set while registering the plugin creator. - //! - //! \see IPluginRegistry::registerCreator() - //! - virtual void setPluginNamespace(const char* pluginNamespace) = 0; - - //! - //! \brief Return the namespace of the plugin creator object. - //! - virtual const char* getPluginNamespace() const = 0; - - virtual ~IPluginCreator() {} -}; - -//! -//! \class IPluginRegistry -//! -//! \brief Single registration point for all plugins in an application. It is -//! used to find plugin implementations during engine deserialization. -//! Internally, the plugin registry is considered to be a singleton so all -//! plugins in an application are part of the same global registry. -//! Note that the plugin registry is only supported for plugins of type -//! IPluginV2 and should also have a corresponding IPluginCreator implementation. -//! -//! \see IPluginV2 and IPluginCreator -//! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. -//! - -class IPluginRegistry -{ -public: - //! - //! \brief Register a plugin creator. Returns false if one with same type - //! is already registered. - //! - virtual bool registerCreator(IPluginCreator& creator, const char* pluginNamespace) = 0; - - //! - //! \brief Return all the registered plugin creators and the number of - //! registered plugin creators. Returns nullptr if none found. - //! - virtual IPluginCreator* const* getPluginCreatorList(int* numCreators) const = 0; - - //! - //! \brief Return plugin creator based on plugin type, version and - //! namespace associated with plugin during network creation. - //! - virtual IPluginCreator* getPluginCreator(const char* pluginType, const char* pluginVersion, const char* pluginNamespace = "") = 0; - -protected: - virtual ~IPluginRegistry() {} -}; - //! //! \enum UnaryOperation //! @@ -3494,7 +2991,7 @@ enum class UnaryOperation : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 19; } //!< Maximum number of elements in UnaryOperation enum. \see UnaryOperation @@ -3514,14 +3011,14 @@ class IUnaryLayer : public ILayer //! //! \see getOperation(), UnaryOperation //! - virtual void setOperation(UnaryOperation op) = 0; + virtual void setOperation(UnaryOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the unary operation for the layer. //! //! \see setOperation(), UnaryOperation //! 
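The unary layer is straightforward; a two-line sketch under the earlier assumptions:

```cpp
// Construct with one operation, then switch it through the accessor.
IUnaryLayer* u = network->addUnary(*data, UnaryOperation::kABS);
u->setOperation(UnaryOperation::kSQRT);
```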
- virtual UnaryOperation getOperation() const = 0; + virtual UnaryOperation getOperation() const TRTNOEXCEPT = 0; protected: virtual ~IUnaryLayer() {} @@ -3542,7 +3039,7 @@ enum class ReduceOperation : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 5; } //!< Maximum number of elements in ReduceOperation enum. \see ReduceOperation @@ -3562,42 +3059,42 @@ class IReduceLayer : public ILayer //! //! \see getOperation(), ReduceOperation //! - virtual void setOperation(ReduceOperation op) = 0; + virtual void setOperation(ReduceOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the reduce operation for the layer. //! //! \see setOperation(), ReduceOperation //! - virtual ReduceOperation getOperation() const = 0; + virtual ReduceOperation getOperation() const TRTNOEXCEPT = 0; //! //! \brief Set the axes over which to reduce. //! //! \see getReduceAxes //! - virtual void setReduceAxes(uint32_t reduceAxes) = 0; + virtual void setReduceAxes(uint32_t reduceAxes) TRTNOEXCEPT = 0; //! //! \brief Get the axes over which to reduce for the layer. //! //! \see setReduceAxes //! - virtual uint32_t getReduceAxes() const = 0; + virtual uint32_t getReduceAxes() const TRTNOEXCEPT = 0; //! //! \brief Set the boolean that specifies whether or not to keep the reduced dimensions for the layer. //! //! \see getKeepDimensions //! - virtual void setKeepDimensions(bool keepDimensions) = 0; + virtual void setKeepDimensions(bool keepDimensions) TRTNOEXCEPT = 0; //! //! \brief Get the boolean that specifies whether or not to keep the reduced dimensions for the layer. //! //! \see setKeepDimensions //! - virtual bool getKeepDimensions() const = 0; + virtual bool getKeepDimensions() const TRTNOEXCEPT = 0; protected: virtual ~IReduceLayer() {} @@ -3608,8 +3105,8 @@ class IReduceLayer : public ILayer //! //! \brief Layer that represents a padding operation. //! -//! The padding layer adds zero-padding at the start and end of the input tensor. It only supports padding along the two innermost dimensions. -//! Applying negative padding results in cropping of the input. +//! The padding layer adds zero-padding at the start and end of the input tensor. It only supports padding along the two +//! innermost dimensions. Applying negative padding results in cropping of the input. //! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! @@ -3623,14 +3120,14 @@ class IPaddingLayer : public ILayer //! //! \see getPrePadding //! - virtual void setPrePadding(DimsHW padding) = 0; + virtual void setPrePadding(DimsHW padding) TRTNOEXCEPT = 0; //! - //! \brief Set the padding that is applied at the start of the tensor. + //! \brief Get the padding that is applied at the start of the tensor. //! //! \see setPrePadding //! - virtual DimsHW getPrePadding() const = 0; + virtual DimsHW getPrePadding() const TRTNOEXCEPT = 0; //! //! \brief Set the padding that is applied at the end of the tensor. @@ -3639,14 +3136,14 @@ class IPaddingLayer : public ILayer //! //! \see getPostPadding //! - virtual void setPostPadding(DimsHW padding) = 0; + virtual void setPostPadding(DimsHW padding) TRTNOEXCEPT = 0; //! - //! \brief Set the padding that is applied at the end of the tensor. + //! \brief Get the padding that is applied at the end of the tensor. //! //! \see setPostPadding //! - virtual DimsHW getPostPadding() const = 0; + virtual DimsHW getPostPadding() const TRTNOEXCEPT = 0; protected: virtual ~IPaddingLayer() {} @@ -3670,6 +3167,9 @@ struct Permutation //! 
This class shuffles data by applying in sequence: a transpose operation, a reshape operation //! and a second transpose operation. The dimension types of the output are those of the reshape dimension. //! +//! The layer has an optional second input. If present, it must be a 1D Int32 shape tensor, +//! and the reshape dimensions are taken from it. +//! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! class IShuffleLayer : public ILayer @@ -3684,7 +3184,7 @@ class IShuffleLayer : public ILayer //! //! \see getFirstTranspose //! - virtual void setFirstTranspose(Permutation permutation) = 0; + virtual void setFirstTranspose(Permutation permutation) TRTNOEXCEPT = 0; //! //! \brief Get the permutation applied by the first transpose operation. @@ -3693,7 +3193,7 @@ class IShuffleLayer : public ILayer //! //! \see setFirstTranspose //! - virtual Permutation getFirstTranspose() const = 0; + virtual Permutation getFirstTranspose() const TRTNOEXCEPT = 0; //! //! \brief Set the reshaped dimensions. @@ -3713,14 +3213,31 @@ class IShuffleLayer : public ILayer //! //! The product of the new dimensions must be equal to the product of the old. //! - virtual void setReshapeDimensions(Dims dimensions) = 0; + //! If there is a second input, i.e. reshape dimensions are dynamic, + //! calling setReshapeDimensions() is an error and does not update + //! the dimensions. + //! + virtual void setReshapeDimensions(Dims dimensions) TRTNOEXCEPT = 0; //! //! \brief Get the reshaped dimensions. //! //! \return The reshaped dimensions. //! - virtual Dims getReshapeDimensions() const = 0; + //! If there is a second input, returns Dims with nbDims == -1. + //! + virtual Dims getReshapeDimensions() const TRTNOEXCEPT = 0; + + //! + //! \brief Relaxes ILayer::setInput to allow appending a second input. + //! + //! Like ILayer::setInput, but additionally works if index==1, nbInputs()==1, and + //! there is no implicit batch dimension, in which case nbInputs() changes to 2. + //! + //! When there is a 2nd input, the reshapeDimensions are taken from it, overriding + //! the dimensions supplied by setReshapeDimensions. + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; //! //! \brief Set the permutation applied by the second transpose operation. @@ -3734,7 +3251,7 @@ class IShuffleLayer : public ILayer //! //! \see getSecondTranspose //! - virtual void setSecondTranspose(Permutation permutation) = 0; + virtual void setSecondTranspose(Permutation permutation) TRTNOEXCEPT = 0; //! //! \brief Get the permutation applied by the second transpose operation. @@ -3743,12 +3260,33 @@ class IShuffleLayer : public ILayer //! //! \see setSecondTranspose //! - virtual Permutation getSecondTranspose() const = 0; + virtual Permutation getSecondTranspose() const TRTNOEXCEPT = 0; protected: virtual ~IShuffleLayer() {} }; +//! +//! \brief Slices an input tensor into an output tensor based on the offset and strides. +//! +//! The slice layer has two variants, static and dynamic. Static slice specifies the start, size, and stride +//! dimensions at layer create time via Dims and can use the get/set accessor functions of the ISliceLayer. Dynamic +//! slice specifies the start and size dimensions at layer create time via ITensors and uses ILayer::setTensor to +//! set the optional stride parameter after layer construction. +//! An application can determine if the ISliceLayer is dynamic or static based on if there are 3 or 4 inputs(Dynamic) +//! 
or 1 input(Static). When working on a shape tensor, a dynamic slace layer must have start, size, and stride +//! specified at build time. +//! +//! The slice layer selects for each dimension a start location from within the input tensor, and given the +//! specified stride, copies strided elements to the output tensor. Start, Size, and Stride shape tensors must be +//! DataType::kINT32. +//! +//! For example using slice on a data tensor: +//! input = {{0, 1}, {2, 3}, {4, 5}} +//! start = {1, 0} +//! size = {1, 2} +//! stride = {1, 2} +//! output = {1, 5} //! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! @@ -3756,58 +3294,128 @@ class ISliceLayer : public ILayer { public: //! - //! \brief Set the start offset + //! \brief Set the start offset that the slice layer uses to create the output slice. //! - //! \param start The start offset + //! \param start The start offset to read data from the input tensor. + //! + //! If the SliceLayer is using dynamic inputs for the start parameter, calling setStart() results in an error + //! and does not update the dimensions. //! //! \see getStart //! - virtual void setStart(Dims start) = 0; + virtual void setStart(Dims start) TRTNOEXCEPT = 0; //! - //! \brief Get the start offset + //! \brief Get the start offset for the slice layer. + //! + //! \return The start offset, or an invalid Dims structure. //! - //! \return The start Offset + //! If the SliceLayer is using dynamic inputs for the start parameter, this function returns an invalid + //! Dims structure. //! //! \see setStart - virtual Dims getStart() const = 0; + //! + virtual Dims getStart() const TRTNOEXCEPT = 0; //! - //! \brief Set the output dimension + //! \brief Set the dimensions of the output slice. //! - //! \param size The output dimension + //! \param size The dimensions of the output slice. + //! + //! If the SliceLayer is using dynamic inputs for the size parameter, calling setSize() results in an error + //! and does not update the dimensions. //! //! \see getSize - virtual void setSize(Dims size) = 0; + //! + virtual void setSize(Dims size) TRTNOEXCEPT = 0; //! - //! \brief Get the output dimension + //! \brief Get dimensions of the output slice. + //! + //! \return The output dimension, or an invalid Dims structure. //! - //! \return The output dimesion + //! If the SliceLayer is using dynamic inputs for the size parameter, this function returns an invalid + //! Dims structure. //! //! \see setSize - virtual Dims getSize() const = 0; + //! + virtual Dims getSize() const TRTNOEXCEPT = 0; //! - //! \brief Set the slicing stride + //! \brief Set the stride for computing the output slice data. //! - //! \param stride The slicing stride + //! \param stride The dimensions of the stride to compute the values to store in the output slice. + //! + //! If the SliceLayer is using dynamic inputs for the stride parameter, calling setSlice() results in an error + //! and does not update the dimensions. //! //! \see getStride - virtual void setStride(Dims stride) = 0; + //! + virtual void setStride(Dims stride) TRTNOEXCEPT = 0; //! - //! \brief Get the slicing stride + //! \brief Get the stride for the output slice. //! - //! \return The slicing stride + //! \return The slicing stride, or an invalid Dims structure. + //! + //! If the SliceLayer is using dynamic inputs for the stride parameter, this function returns a invalid + //! Dims structure. //! //! \see setStride - virtual Dims getStride() const = 0; + //! 
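To make the static/dynamic distinction concrete, here is a small static-slice sketch. It assumes a 2x3 input tensor holding {{0, 2, 4}, {1, 3, 5}}; the helper name is illustrative.

```cpp
#include "NvInfer.h"

// Illustrative: a static slice. For the assumed 2x3 input {{0, 2, 4}, {1, 3, 5}}
// this produces the 1x2 tensor {{1, 5}}.
nvinfer1::ISliceLayer* addExampleSlice(nvinfer1::INetworkDefinition& network, nvinfer1::ITensor& input)
{
    using namespace nvinfer1;
    ISliceLayer* slice = network.addSlice(input,
                                          Dims2{1, 0},   // start
                                          Dims2{1, 2},   // size
                                          Dims2{1, 2});  // stride
    // The accessors described above (setStart/setSize/setStride) remain usable
    // as long as no dynamic (tensor) inputs have been attached via setInput().
    return slice;
}
```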
+ virtual Dims getStride() const TRTNOEXCEPT = 0; + + //! + //! \brief replace an input of this layer with a specific tensor. + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor + //! + //! Sets the input tensor for the given index. The index must be 0 for a static slice layer. + //! A static slice layer is converted to a dynamic slice layer by calling setInput with an index > 0. + //! A dynamic slice layer cannot be converted back to a static slice layer. + //! + //! For a dynamic slice layer, the values 0-3 are valid. If an index > 0 is specified, all values between + //! index 0 and that index must be dynamic tensors. The values larger than index can use static dimensions. + //! For example, if an index of two is specified, the stride tensor can be set via setStride, but the start tensor + //! must be specified via setInput as both size and start are converted to dynamic tensors. + //! The indices in the dynamic case are as follows: + //! + //! Index | Description + //! 0 | Data or Shape tensor to be sliced. + //! 1 | The start tensor to begin slicing, N-dimensional for Data, and 1-D for Shape. + //! 2 | The size tensor of the resulting slice, N-dimensional for Data, and 1-D for Shape. + //! 3 | The stride of the slicing operation, N-dimensional for Data, and 1-D for Shape. + //! + //! If this function is called with a value greater than 0, then the function getNbInputs() changes + //! from returning 1 to index + 1. When converting from static to dynamic slice layer, + //! all unset tensors, between 1 and index + 1, are initialized to nullptr. It is an error to attempt to build + //! a network that has any nullptr inputs. + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; protected: virtual ~ISliceLayer() {} }; +//! \class IShapeLayer +//! +//! \brief Layer type for getting shape of a tensor. +//! +//! This class sets the output to a one-dimensional tensor with the dimensions of the input tensor. +//! +//! For example, if the input is a four-dimensional tensor (of any type) with +//! dimensions [2,3,5,7], the output tensor is a one-dimensional Int32 tensor +//! of length 4 containing the sequence 2, 3, 5, 7. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IShapeLayer : public ILayer +{ +protected: + virtual ~IShapeLayer() {} +}; + //! //! \enum TopKOperation //! @@ -3820,7 +3428,7 @@ enum class TopKOperation : int }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 2; } //!< Maximum number of elements in TopKOperation enum. \see TopKOperation @@ -3840,44 +3448,44 @@ class ITopKLayer : public ILayer //! //! \see getOperation(), TopKOperation //! - virtual void setOperation(TopKOperation op) = 0; + virtual void setOperation(TopKOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the operation for the layer. //! //! \see setOperation(), TopKOperation //! - virtual TopKOperation getOperation() const = 0; + virtual TopKOperation getOperation() const TRTNOEXCEPT = 0; //! //! \brief Set the k value for the layer. //! - //! Currently only values up to 25 are supported. + //! Currently only values up to 3840 are supported. //! //! \see getK() //! - virtual void setK(int k) = 0; + virtual void setK(int k) TRTNOEXCEPT = 0; //! //! \brief Get the k value for the layer. //! //! \see setK() //! - virtual int getK() const = 0; + virtual int getK() const TRTNOEXCEPT = 0; //! //! 
\brief Set which axes to reduce for the layer. //! //! \see getReduceAxes() //! - virtual void setReduceAxes(uint32_t reduceAxes) = 0; + virtual void setReduceAxes(uint32_t reduceAxes) TRTNOEXCEPT = 0; //! //! \brief Get the axes to reduce for the layer. //! //! \see setReduceAxes() //! - virtual uint32_t getReduceAxes() const = 0; + virtual uint32_t getReduceAxes() const TRTNOEXCEPT = 0; protected: virtual ~ITopKLayer() {} @@ -3901,11 +3509,18 @@ enum class MatrixOperation : int //! Treat x as a vector if it has one dimension, or as a collection of //! vectors if x has more than one dimension. x must have at least one dimension. + //! The first input tensor with dimensions [M,K] used with MatrixOperation::kVECTOR is equivalent to a tensor + //! with dimensions [M, 1, K] with MatrixOperation::kNONE, i.e. is treated as M row vectors of length K. + //! If MatrixOperation::kTRANSPOSE is specified, then the dimensions are [M, K, 1]. + //! + //! The second input tensor with dimensions [M,K] used with MatrixOperation::kVECTOR is equivalent to a tensor + //! with dimensions [M, K, 1] with MatrixOperation::kNONE, i.e. is treated as M column vectors of length K. + //! If MatrixOperation::kTRANSPOSE is specified, then the dimensions are [M, 1, K]. kVECTOR }; template <> -inline int EnumMax() +constexpr inline int EnumMax() { return 3; } //!< Maximum number of elements in MatrixOperation enum. \see DataType @@ -3944,14 +3559,14 @@ class IMatrixMultiplyLayer : public ILayer //! \param op New operation. //! \see getTranspose() //! - virtual void setOperation(int index, MatrixOperation op) = 0; + virtual void setOperation(int index, MatrixOperation op) TRTNOEXCEPT = 0; //! //! \brief Get the operation for an input tensor. //! \param index Input tensor number (0 or 1). //! \see setTranspose() //! - virtual MatrixOperation getOperation(int index) const = 0; + virtual MatrixOperation getOperation(int index) const TRTNOEXCEPT = 0; //! //! \brief Set the transpose flag for an input tensor. @@ -3961,7 +3576,7 @@ class IMatrixMultiplyLayer : public ILayer //! //! \deprecated setTranspose is superseded by setOperation. //! - virtual void setTranspose(int index, bool val) = 0; + TRT_DEPRECATED virtual void setTranspose(int index, bool val) TRTNOEXCEPT = 0; //! //! \brief Get the transpose flag for an input tensor. @@ -3970,7 +3585,7 @@ class IMatrixMultiplyLayer : public ILayer //! //! \deprecated getTranspose is superseded by getOperation. //! - virtual bool getTranspose(int index) const = 0; + TRT_DEPRECATED virtual bool getTranspose(int index) const TRTNOEXCEPT = 0; protected: virtual ~IMatrixMultiplyLayer() {} @@ -4023,47 +3638,227 @@ class IConstantLayer : public ILayer //! \brief Set the weights for the layer. //! //! If weights.type is DataType::kINT32, the output is a tensor of 32-bit indices. - //! Otherwise the output is a tensor of real values, and the output type will be - //! FP32, FP16, or quantized INT8 following TensorRT's normal precision rules. + //! Otherwise the output is a tensor of real values and the output type will be + //! follow TensorRT's normal precision rules. //! //! \see getWeights() //! - virtual void setWeights(Weights weights) = 0; + virtual void setWeights(Weights weights) TRTNOEXCEPT = 0; //! //! \brief Get the weights for the layer. //! //! \see setWeights //! - virtual Weights getWeights() const = 0; + virtual Weights getWeights() const TRTNOEXCEPT = 0; //! //! \brief Set the dimensions for the layer. //! //! \param dimensions The dimensions of the layer //! - //! 
@see setDimensions + //! \see setDimensions //! - virtual void setDimensions(Dims dimensions) = 0; + virtual void setDimensions(Dims dimensions) TRTNOEXCEPT = 0; //! //! \brief Get the dimensions for the layer. //! //! \return the dimensions for the layer //! - //! @see getDimensions + //! \see getDimensions //! - virtual Dims getDimensions() const = 0; + virtual Dims getDimensions() const TRTNOEXCEPT = 0; protected: virtual ~IConstantLayer() {} }; +//! +//! \class IParametricReLULayer +//! +//! \brief Layer that represents a parametric ReLU operation. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IParametricReLULayer : public ILayer +{ +protected: + virtual ~IParametricReLULayer() noexcept {} +}; + +//! \enum ResizeMode +//! +//! \brief Enumerates various modes of resize in the resize layer. +//! Resize mode set using setResizeMode(). +//! +enum class ResizeMode : int +{ + kNEAREST = 0, // N-D (0 < N <= 8) nearest neighbor resizing. + kLINEAR = 1 // Can handle linear (1D), bilinear (2D), and trilinear (3D) resizing. +}; + +template <> +constexpr inline int EnumMax() +{ + return 2; +} //!< Maximum number of elements in ResizeMode enum. \see ResizeMode + +//! \class IResizeLayer +//! +//! \brief A resize layer in a network definition. +//! +//! Resize layer can be used for resizing a N-D tensor. +//! +//! Resize layer currently supports the following configurations: +//! - ResizeMode::kNEAREST - resizes innermost `m` dimensions of N-D, where 0 < m <= min(8, N) and N > 0 +//! - ResizeMode::kLINEAR - resizes innermost `m` dimensions of N-D, where 0 < m <= min(3, N) and N > 0 +//! +//! Default resize mode is ResizeMode::kNEAREST. +//! Resize layer provides two ways to resize tensor dimensions. +//! - Set output dimensions directly. It can be done for static as well as dynamic resize layer. +//! Static resize layer requires output dimensions to be known at build-time. +//! Dynamic resize layer requires output dimensions to be set as one of the input tensors. +//! - Set scales for resize. Each output dimension is calculated as floor(input dimension * scale). +//! Only static resize layer allows setting scales where the scales are known at build-time. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IResizeLayer : public ILayer +{ +public: + //! + //! \brief Set the output dimensions. + //! + //! \param dimensions The output dimensions. Number of output dimensions must be the same as the number of input dimensions. + //! + //! If there is a second input, i.e. resize layer is dynamic, + //! calling setOutputDimensions() is an error and does not update the + //! dimensions. + //! + //! Output dimensions can be specified directly, or via scale factors relative to input dimensions. + //! Scales for resize can be provided using setScales(). + //! + //! \see setScales + //! \see getOutputDimensions + //! + virtual void setOutputDimensions(Dims dimensions) TRTNOEXCEPT = 0; + + //! + //! \brief Get the output dimensions. + //! + //! \return The output dimensions. + //! + virtual Dims getOutputDimensions() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the resize scales. + //! + //! \param scales An array of resize scales. + //! \param nbScales Number of scales. Number of scales must be equal to the number of input dimensions. + //! + //! If there is a second input, i.e. resize layer is dynamic, + //! 
calling setScales() is an error and does not update the scales. + //! + //! Output dimensions are calculated as follows: + //! outputDims[i] = floor(inputDims[i] * scales[i]) + //! + //! Output dimensions can be specified directly, or via scale factors relative to input dimensions. + //! Output dimensions can be provided directly using setOutputDimensions(). + //! + //! \see setOutputDimensions + //! \see getScales + //! + virtual void setScales(const float* scales, int nbScales) TRTNOEXCEPT = 0; + + //! + //! \brief Copies resize scales to scales[0, ..., nbScales-1], where nbScales is the number of scales that were set. + //! + //! \param size The number of scales to get. If size != nbScales, no scales will be copied. + //! + //! \param scales Pointer to where to copy the scales. Scales will be copied only if + //! size == nbScales and scales != nullptr. + //! + //! In case the size is not known consider using size = 0 and scales = nullptr. This method will return + //! the number of resize scales. + //! + //! \return The number of resize scales i.e. nbScales if scales were set. + //! Return -1 in case no scales were set or resize layer is used in dynamic mode. + //! + virtual int getScales(int size, float* scales) const TRTNOEXCEPT = 0; + + //! + //! \brief Set resize mode for an input tensor. + //! + //! Supported resize modes are Nearest Neighbor and Linear. + //! + //! \see ResizeMode + //! + virtual void setResizeMode(ResizeMode resizeMode) TRTNOEXCEPT = 0; + + //! + //! \brief Get resize mode for an input tensor. + //! + //! \return The resize mode. + //! + virtual ResizeMode getResizeMode() const TRTNOEXCEPT = 0; + + //! + //! \brief Set whether to align corners while resizing. + //! + //! If true, the centers of the 4 corner pixels of both input and output + //! tensors are aligned i.e. preserves the values of corner + //! pixels. + //! + //! Default: false. + //! + virtual void setAlignCorners(bool alignCorners) TRTNOEXCEPT = 0; + + //! + //! \brief True if align corners has been set. + //! + //! \return True if align corners has been set, false otherwise. + //! + virtual bool getAlignCorners() const TRTNOEXCEPT = 0; + + //! + //! \brief Relaxes ILayer::setInput to allow appending a second input. + //! + //! \param index the index of the input to modify. + //! \param tensor the new input tensor. + //! + //! Like ILayer::setInput, but additionally works if index == 1 and nbInputs == 1, and + //! there is no implicit batch dimension, in which case nbInputs() changes to 2. + //! Once such additional input is set, resize layer works in dynamic mode. + //! + //! When index == 1 and nbInputs == 1, the output dimensions are used from + //! the input tensor, overriding the dimensions supplied by setOutputDimensions. + //! + //! \warning tensor must be a shape tensor. + //! + void setInput(int index, ITensor& tensor) _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; + +protected: + virtual ~IResizeLayer() {} +}; + //! //! \class INetworkDefinition //! //! \brief A network definition for input to the builder. //! +//! A network definition defines the structure of the network, and combined with a IBuilderConfig, is built +//! into an engine using an IBuilder. An INetworkDefinition can either have an implicit batch dimensions, specified +//! at runtime, or all dimensions explicit, full dims mode, in the network definition. When a network has been +//! created using createNetwork(), only implicit batch size mode is supported. The function hasImplicitBatchSize() +//! 
is used to query the mode of the network. +//! +//! A network with implicit batch dimensions returns the dimensions of a layer without the implicit dimension, +//! and instead the batch is specified at execute/enqueue time. If the network has all dimensions specified, then +//! the first dimension follows elementwise broadcast rules: if it is 1 for some inputs and is some value N for all +//! other inputs, then the first dimension of each outut is N, and the inputs with 1 for the first dimension are +//! broadcast. Having divergent batch sizes across inputs to a layer is not supported. +//! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! class INetworkDefinition @@ -4072,27 +3867,39 @@ class INetworkDefinition //! //! \brief Add an input tensor to the network. //! - //! The name of the input tensor is used to find the index into the buffer array for an engine built from the network. + //! The name of the input tensor is used to find the index into the buffer array for an engine built from + //! the network. The volume of the dimensions must be less than 2^30 elements. + + //! For networks with an implicit batch dimension, this volume includes the batch dimension with its length set + //! to the maximum batch size. For networks with all explicit dimensions and with wildcard dimensions, the volume + //! is based on the maxima specified by an IOptimizationProfile.Dimensions are normally positive integers. The + //! exception is that in networks with all explicit dimensions, -1 can be used as a wildcard for a dimension to + //! be specified at runtime. Input tensors with such a wildcard must have a corresponding entry in the + //! IOptimizationProfiles indicating the permitted extrema, and the input dimensions must be set by + //! IExecutionContext::setBindingDimensions. Different IExecutionContext instances can have different dimensions. + //! Wildcard dimensions are only supported for EngineCapability::kDEFAULT with DeviceType::kGPU. They are not + //! supported in safety contexts or on the DLA. //! //! \param name The name of the tensor. //! \param type The type of the data held in the tensor. //! \param dimensions The dimensions of the tensor. //! - //! Only DataType::kFLOAT, DataType::kHALF and DataType::kINT32 are valid input tensor types. - //! The volume of the dimensions, including the maximum batch size, must be less than 2^30 elements. + //! \warning It is an error to specify a wildcard value on a dimension that is determined by trained parameters. //! //! \see ITensor //! //! \return The new tensor or nullptr if there is an error. //! - virtual ITensor* addInput(const char* name, DataType type, Dims dimensions) = 0; + virtual ITensor* addInput(const char* name, DataType type, Dims dimensions) TRTNOEXCEPT = 0; //! //! \brief Mark a tensor as a network output. //! //! \param tensor The tensor to mark as an output tensor. //! - virtual void markOutput(ITensor& tensor) = 0; + //! \warning It is an error to mark a network input as an output. + //! + virtual void markOutput(ITensor& tensor) TRTNOEXCEPT = 0; //! //! \brief Add a convolution layer to the network. @@ -4105,9 +3912,13 @@ class INetworkDefinition //! //! \see IConvolutionLayer //! + //! \warning It is an error to specify a wildcard value for the 'C' dimension of the input tensor. + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new convolution layer, or nullptr if it could not be created. //! 
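A minimal sketch of the input-convolution-output pattern these comments describe, assuming an implicit-batch network created with createNetwork() and kernel/bias Weights that wrap appropriately sized float arrays (all names and sizes are illustrative):

```cpp
#include "NvInfer.h"

using namespace nvinfer1;

// Illustrative: builds the stem of a network inside an existing implicit-batch
// INetworkDefinition. kernelWeights/biasWeights are assumed to wrap 64*3*7*7
// and 64 valid floats respectively.
void addConvStem(INetworkDefinition& network, Weights kernelWeights, Weights biasWeights)
{
    ITensor* data = network.addInput("data", DataType::kFLOAT, Dims3{3, 224, 224});
    IConvolutionLayer* conv = network.addConvolution(*data, 64, DimsHW{7, 7},
                                                     kernelWeights, biasWeights);
    conv->setStride(DimsHW{2, 2});
    conv->setPadding(DimsHW{3, 3});
    network.markOutput(*conv->getOutput(0));
}
```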
- virtual IConvolutionLayer* addConvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, Weights kernelWeights, Weights biasWeights) = 0; + virtual IConvolutionLayer* addConvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, + Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! //! \brief Add a fully connected layer to the network. @@ -4119,9 +3930,13 @@ class INetworkDefinition //! //! \see IFullyConnectedLayer //! + //! \warning It is an error to specify a wildcard value for the 'C' dimension of the input tensor. + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new fully connected layer, or nullptr if it could not be created. //! - virtual IFullyConnectedLayer* addFullyConnected(ITensor& input, int nbOutputs, Weights kernelWeights, Weights biasWeights) = 0; + virtual IFullyConnectedLayer* addFullyConnected( + ITensor& input, int nbOutputs, Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! //! \brief Add an activation layer to the network. @@ -4133,10 +3948,11 @@ class INetworkDefinition //! output for activations that require these parameters. //! //! \see IActivationLayer ActivationType + //! \warning Int32 tensors are not valid input tensors. //! //! \return The new activation layer, or nullptr if it could not be created. //! - virtual IActivationLayer* addActivation(ITensor& input, ActivationType type) = 0; + virtual IActivationLayer* addActivation(ITensor& input, ActivationType type) TRTNOEXCEPT = 0; //! //! \brief Add a pooling layer to the network. @@ -4146,10 +3962,11 @@ class INetworkDefinition //! \param windowSize The size of the pooling window. //! //! \see IPoolingLayer PoolingType + //! \warning Int32 tensors are not valid input tensors. //! //! \return The new pooling layer, or nullptr if it could not be created. //! - virtual IPoolingLayer* addPooling(ITensor& input, PoolingType type, DimsHW windowSize) = 0; + virtual IPoolingLayer* addPooling(ITensor& input, PoolingType type, DimsHW windowSize) TRTNOEXCEPT = 0; //! //! \brief Add a LRN layer to the network. @@ -4161,39 +3978,42 @@ class INetworkDefinition //! \param k The k value for the LRN computation. //! //! \see ILRNLayer + //! \warning Int32 tensors are not valid input tensors. //! //! \return The new LRN layer, or nullptr if it could not be created. //! - virtual ILRNLayer* addLRN(ITensor& input, int window, float alpha, float beta, float k) = 0; + virtual ILRNLayer* addLRN(ITensor& input, int window, float alpha, float beta, float k) TRTNOEXCEPT = 0; //! //! \brief Add a Scale layer to the network. //! - //! \param input The input tensor to The layer. This tensor is required to have a minimum of 3 dimensions. + //! \param input The input tensor to the layer. This tensor is required to have a minimum of 3 dimensions. //! \param mode The scaling mode. //! \param shift The shift value. //! \param scale The scale value. //! \param power The power value. //! - //! If the weights are available, then the size of weights are dependent on the on the ScaleMode. + //! If the weights are available, then the size of weights are dependent on the ScaleMode. //! For ::kUNIFORM, the number of weights is equal to 1. //! For ::kCHANNEL, the number of weights is equal to the channel dimension. //! For ::kELEMENTWISE, the number of weights is equal to the volume of the input. //! //! \see IScaleLayer + //! \warning Int32 tensors are not valid input tensors. //! //! \return The new Scale layer, or nullptr if it could not be created. //! 
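For the ::kCHANNEL case described above, the number of weights must equal the channel count. A sketch assuming a CHW input with C == 3 (values and the helper name are illustrative):

```cpp
#include "NvInfer.h"

// Illustrative: per-channel scale for a CHW input with C == 3.
nvinfer1::IScaleLayer* addNormalize(nvinfer1::INetworkDefinition& network, nvinfer1::ITensor& input)
{
    using namespace nvinfer1;
    // The arrays must stay valid until the engine has been built.
    static const float shiftVals[3] = {0.0f, 0.0f, 0.0f};
    static const float scaleVals[3] = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f};
    Weights shift{DataType::kFLOAT, shiftVals, 3};
    Weights scale{DataType::kFLOAT, scaleVals, 3};
    Weights power{DataType::kFLOAT, nullptr, 0}; // count 0: power left at its default of 1
    return network.addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
}
```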
- virtual IScaleLayer* addScale(ITensor& input, ScaleMode mode, Weights shift, Weights scale, Weights power) = 0; + virtual IScaleLayer* addScale(ITensor& input, ScaleMode mode, Weights shift, Weights scale, Weights power) TRTNOEXCEPT = 0; //! //! \brief Add a SoftMax layer to the network. //! //! \see ISoftMaxLayer + //! \warning Int32 tensors are not valid input tensors. //! //! \return The new SoftMax layer, or nullptr if it could not be created. //! - virtual ISoftMaxLayer* addSoftMax(ITensor& input) = 0; + virtual ISoftMaxLayer* addSoftMax(ITensor& input) TRTNOEXCEPT = 0; //! //! \brief Add a concatenation layer to the network. @@ -4207,22 +4027,26 @@ class INetworkDefinition //! //! \warning All tensors must have the same dimensions for all dimensions except for channel. //! - virtual IConcatenationLayer* addConcatenation(ITensor* const* inputs, int nbInputs) = 0; + virtual IConcatenationLayer* addConcatenation(ITensor* const* inputs, int nbInputs) TRTNOEXCEPT = 0; //! //! \brief Add a deconvolution layer to the network. //! //! \param input The input tensor to the layer. //! \param nbOutputMaps The number of output feature maps. - //! \param kernelSize The HW-dimensions of the convolution kernel. - //! \param kernelWeights The kernel weights for the convolution. - //! \param biasWeights The optional bias weights for the convolution. + //! \param kernelSize The HW-dimensions of the deconvolution kernel. + //! \param kernelWeights The kernel weights for the deconvolution. + //! \param biasWeights The optional bias weights for the deconvolution. //! //! \see IDeconvolutionLayer //! + //! \warning It is an error to specify a wildcard value for the 'C' dimension of the input tensor. + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new deconvolution layer, or nullptr if it could not be created. //! - virtual IDeconvolutionLayer* addDeconvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, Weights kernelWeights, Weights biasWeights) = 0; + virtual IDeconvolutionLayer* addDeconvolution(ITensor& input, int nbOutputMaps, DimsHW kernelSize, + Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! //! \brief Add an elementwise layer to the network. @@ -4240,10 +4064,11 @@ class INetworkDefinition //! corresponding input dimension. //! //! \see IElementWiseLayer + //! \warning For shape tensors, ElementWiseOperation::kPOW is not a valid op. //! //! \return The new elementwise layer, or nullptr if it could not be created. //! - virtual IElementWiseLayer* addElementWise(ITensor& input1, ITensor& input2, ElementWiseOperation op) = 0; + virtual IElementWiseLayer* addElementWise(ITensor& input1, ITensor& input2, ElementWiseOperation op) TRTNOEXCEPT = 0; //! //! \brief Add an \p layerCount deep RNN layer to the network with a @@ -4298,9 +4123,13 @@ class INetworkDefinition //! //! \see IRNNLayer //! + //! \warning RNN inputs do not support wildcard dimensions or explicit batch size networks. + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new RNN layer, or nullptr if it could not be created. //! - virtual IRNNLayer* addRNN(ITensor& inputs, int layerCount, std::size_t hiddenSize, int maxSeqLen, RNNOperation op, RNNInputMode mode, RNNDirection dir, Weights weights, Weights bias) = 0; + TRT_DEPRECATED virtual IRNNLayer* addRNN(ITensor& inputs, int layerCount, std::size_t hiddenSize, int maxSeqLen, + RNNOperation op, RNNInputMode mode, RNNDirection dir, Weights weights, Weights bias) TRTNOEXCEPT = 0; //! //! 
\brief Add a plugin layer to the network. @@ -4311,9 +4140,15 @@ class INetworkDefinition //! //! \see IPluginLayer //! + //! \deprecated IPluginLayer is superseded by IPluginV2. use addPluginV2 instead. + //! + //! \warning Plugin inputs do not support wildcard dimensions or explicit batch size networks. + //! \warning Int32 tensors are not valid input tensors. + //! //! \return the new plugin layer, or nullptr if it could not be created. //! - virtual IPluginLayer* addPlugin(ITensor* const* inputs, int nbInputs, IPlugin& plugin) = 0; + TRT_DEPRECATED virtual IPluginLayer* addPlugin( + ITensor* const* inputs, int nbInputs, IPlugin& plugin) TRTNOEXCEPT = 0; //! //! \brief Add a unary layer to the network. @@ -4323,9 +4158,11 @@ class INetworkDefinition //! //! \see IUnaryLayer //! + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new unary layer, or nullptr if it could not be created //! - virtual IUnaryLayer* addUnary(ITensor& input, UnaryOperation operation) = 0; + virtual IUnaryLayer* addUnary(ITensor& input, UnaryOperation operation) TRTNOEXCEPT = 0; //! \brief Add a padding layer to the network. //! @@ -4337,7 +4174,7 @@ class INetworkDefinition //! //! \return The new padding layer, or nullptr if it could not be created. //! - virtual IPaddingLayer* addPadding(ITensor& input, DimsHW prePadding, DimsHW postPadding) = 0; + virtual IPaddingLayer* addPadding(ITensor& input, DimsHW prePadding, DimsHW postPadding) TRTNOEXCEPT = 0; //! //! \brief Add a shuffle layer to the network. @@ -4348,40 +4185,49 @@ class INetworkDefinition //! //! \return The new shuffle layer, or nullptr if it could not be created. //! - virtual IShuffleLayer* addShuffle(ITensor& input) = 0; + virtual IShuffleLayer* addShuffle(ITensor& input) TRTNOEXCEPT = 0; //! //! \brief Set the pooling output dimensions formula. //! - //! \param formula The formula from computing the pooling output dimensions. If null is passed, the default formula is used. + //! \param formula The formula from computing the pooling output dimensions. If null is passed, the default + //! formula is used. //! //! The default formula in each dimension is (inputDim + padding * 2 - kernelSize) / stride + 1. //! + //! \warning Custom output dimensions formulas are not supported with wildcard dimensions. + //! //! \see IOutputDimensionsFormula getPoolingOutputDimensionsFormula() //! - virtual void setPoolingOutputDimensionsFormula(IOutputDimensionsFormula* formula) = 0; + TRT_DEPRECATED virtual void setPoolingOutputDimensionsFormula(IOutputDimensionsFormula* formula) TRTNOEXCEPT = 0; //! //! \brief Get the pooling output dimensions formula. //! //! \return The formula from computing the pooling output dimensions. //! + //! \warning Custom output dimensions formulas are not supported with wildcard dimensions. + //! //! \see IOutputDimensionsFormula setPoolingOutputDimensionsFormula() //! - virtual IOutputDimensionsFormula& getPoolingOutputDimensionsFormula() const = 0; + TRT_DEPRECATED virtual IOutputDimensionsFormula& getPoolingOutputDimensionsFormula() const TRTNOEXCEPT = 0; //! //! \brief Set the convolution output dimensions formula. //! //! \deprecated This method does not currently work reliably and will be removed in a future release. //! - //! \param formula The formula from computing the convolution output dimensions. If null is passed, the default formula is used. + //! \param formula The formula from computing the convolution output dimensions. If null is passed, the default + //! formula is used. //! //! 
The default formula in each dimension is (inputDim + padding * 2 - kernelSize) / stride + 1. //! + //! \warning Custom output dimensions formulas are not supported with wildcard dimensions. + //! //! \see IOutputDimensionsFormula getConvolutionOutputDimensionsFormula() //! - virtual void setConvolutionOutputDimensionsFormula(IOutputDimensionsFormula* formula) = 0; + TRT_DEPRECATED virtual void setConvolutionOutputDimensionsFormula( + IOutputDimensionsFormula* formula) TRTNOEXCEPT = 0; //! //! \brief Get the convolution output dimensions formula. @@ -4390,22 +4236,28 @@ class INetworkDefinition //! //! \return The formula from computing the convolution output dimensions. //! + //! \warning Custom output dimensions formulas are not supported with wildcard dimensions. + //! //! \see IOutputDimensionsFormula setConvolutionOutputDimensionsFormula() //! - virtual IOutputDimensionsFormula& getConvolutionOutputDimensionsFormula() const = 0; + TRT_DEPRECATED virtual IOutputDimensionsFormula& getConvolutionOutputDimensionsFormula() const TRTNOEXCEPT = 0; //! //! \brief Set the deconvolution output dimensions formula. //! //! \deprecated This method does not currently work reliably and will be removed in a future release. //! - //! \param formula The formula from computing the deconvolution output dimensions. If null is passed, the default formula is used. + //! \param formula The formula from computing the deconvolution output dimensions. If null is passed, the default! + //! formula is used. //! //! The default formula in each dimension is (inputDim - 1) * stride + kernelSize - 2 * padding. //! + //! \warning Custom output dimensions formulas are not supported with wildcard dimensions. + //! //! \see IOutputDimensionsFormula getDevonvolutionOutputDimensionsFormula() //! - virtual void setDeconvolutionOutputDimensionsFormula(IOutputDimensionsFormula* formula) = 0; + TRT_DEPRECATED virtual void setDeconvolutionOutputDimensionsFormula( + IOutputDimensionsFormula* formula) TRTNOEXCEPT = 0; //! //! \brief Get the deconvolution output dimensions formula. @@ -4414,9 +4266,11 @@ class INetworkDefinition //! //! \deprecated This method does not currently work reliably and will be removed in a future release. //! + //! \warning Custom output dimensions formulas are not supported with wildcard dimensions. + //! //! \see IOutputDimensionsFormula setDeconvolutionOutputDimensionsFormula() //! - virtual IOutputDimensionsFormula& getDeconvolutionOutputDimensionsFormula() const = 0; + TRT_DEPRECATED virtual IOutputDimensionsFormula& getDeconvolutionOutputDimensionsFormula() const TRTNOEXCEPT = 0; //! //! \brief Get the number of layers in the network. @@ -4425,7 +4279,7 @@ class INetworkDefinition //! //! \see getLayer() //! - virtual int getNbLayers() const = 0; + virtual int getNbLayers() const TRTNOEXCEPT = 0; //! //! \brief Get the layer specified by the given index. @@ -4436,7 +4290,7 @@ class INetworkDefinition //! //! \see getNbLayers() //! - virtual ILayer* getLayer(int index) const = 0; + virtual ILayer* getLayer(int index) const TRTNOEXCEPT = 0; //! //! \brief Get the number of inputs in the network. @@ -4445,7 +4299,7 @@ class INetworkDefinition //! //! \see getInput() //! - virtual int getNbInputs() const = 0; + virtual int getNbInputs() const TRTNOEXCEPT = 0; //! //! \brief Get the input tensor specified by the given index. @@ -4456,16 +4310,18 @@ class INetworkDefinition //! //! \see getNbInputs() //! 
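The read-only accessors documented here are enough to walk an existing network definition; a short sketch (the helper name is illustrative):

```cpp
#include "NvInfer.h"
#include <cstdio>

// Illustrative: enumerate the layers and inputs of an existing network.
void dumpNetwork(const nvinfer1::INetworkDefinition& network)
{
    for (int i = 0; i < network.getNbLayers(); ++i)
    {
        std::printf("layer %d: %s\n", i, network.getLayer(i)->getName());
    }
    for (int i = 0; i < network.getNbInputs(); ++i)
    {
        std::printf("input %d: %s\n", i, network.getInput(i)->getName());
    }
}
```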
- virtual ITensor* getInput(int index) const = 0; // adding inputs invalidates indexing here + virtual ITensor* getInput(int index) const TRTNOEXCEPT = 0; // adding inputs invalidates indexing here //! //! \brief Get the number of outputs in the network. //! + //! The outputs include those marked by markOutput or markOutputForShapes. + //! //! \return The number of outputs in the network. //! //! \see getOutput() //! - virtual int getNbOutputs() const = 0; + virtual int getNbOutputs() const TRTNOEXCEPT = 0; //! //! \brief Get the output tensor specified by the given index. @@ -4476,12 +4332,12 @@ class INetworkDefinition //! //! \see getNbOutputs() //! - virtual ITensor* getOutput(int index) const = 0; // adding outputs invalidates indexing here + virtual ITensor* getOutput(int index) const TRTNOEXCEPT = 0; // adding outputs invalidates indexing here //! //! \brief Destroy this INetworkDefinition object. //! - virtual void destroy() = 0; + virtual void destroy() TRTNOEXCEPT = 0; protected: virtual ~INetworkDefinition() {} @@ -4493,20 +4349,20 @@ class INetworkDefinition //! \param input The input tensor to the layer. //! \param operation The reduction operation to perform. //! \param reduceAxes The reduction dimensions. - //! Bit 0 of the uint32_t type corresponds to the non-batch dimension 0 boolean and so on. - //! If a bit is set, then the corresponding dimension will be reduced. - //! Let's say we have an NCHW tensor as input (three non-batch dimensions). - //! Bit 0 corresponds to the C dimension boolean. - //! Bit 1 corresponds to the H dimension boolean. - //! Bit 2 corresponds to the W dimension boolean. - //! Note that reduction is not permitted over the batch size dimension. - //! \param keepDimensions The boolean that specifies whether or not to keep the reduced dimensions in the output of the layer. + //! The bit in position i of bitmask reduceAxes corresponds to explicit dimension i if result. + //! E.g., the least significant bit corresponds to the first explicit dimension and the next to least + //! significant bit corresponds to the second explicit dimension. + //! + //! \param keepDimensions The boolean that specifies whether or not to keep the reduced dimensions in the + //! output of the layer. //! //! \see IReduceLayer //! + //! \warning If input is a shape tensor, ReduceOperation::kAVG is unsupported. + //! //! \return The new reduce layer, or nullptr if it could not be created. //! - virtual IReduceLayer* addReduce(ITensor& input, ReduceOperation operation, uint32_t reduceAxes, bool keepDimensions) = 0; + virtual IReduceLayer* addReduce(ITensor& input, ReduceOperation operation, uint32_t reduceAxes, bool keepDimensions) TRTNOEXCEPT = 0; //! //! \brief Add a TopK layer to the network. @@ -4524,32 +4380,32 @@ class INetworkDefinition //! \param k Number of elements to keep. //! //! \param reduceAxes The reduction dimensions. - //! Bit 0 of the uint32_t type corresponds to the non-batch dimension 0 boolean and so on. - //! If a bit is set, then the corresponding dimension will be reduced. - //! Let's say we have an NCHW tensor as input (three non-batch dimensions). - //! Bit 0 corresponds to the C dimension boolean. - //! Bit 1 corresponds to the H dimension boolean. - //! Bit 2 corresponds to the W dimension boolean. - //! Note that TopK reduction is currently only permitted over one dimension. + //! The bit in position i of bitmask reduceAxes corresponds to explicit dimension i of the result. + //! 
E.g., the least significant bit corresponds to the first explicit dimension and the next to least + //! significant bit corresponds to the second explicit dimension. + //! + //! Currently reduceAxes must specify exactly one dimension, and it must be one of the last four dimensions. //! //! \see ITopKLayer //! + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new TopK layer, or nullptr if it could not be created. //! - virtual ITopKLayer* addTopK(ITensor& input, TopKOperation op, int k, uint32_t reduceAxes) = 0; + virtual ITopKLayer* addTopK(ITensor& input, TopKOperation op, int k, uint32_t reduceAxes) TRTNOEXCEPT = 0; //! //! \brief Add a gather layer to the network. //! //! \param data The tensor to gather values from. //! \param indices The tensor to get indices from to populate the output tensor. - //! \param axis The non-batch dimension axis in the data tensor to gather on. + //! \param axis The axis in the data tensor to gather on. //! //! \see IGatherLayer //! //! \return The new gather layer, or nullptr if it could not be created. //! - virtual IGatherLayer* addGather(ITensor& data, ITensor& indices, int axis) = 0; + virtual IGatherLayer* addGather(ITensor& data, ITensor& indices, int axis) TRTNOEXCEPT = 0; //! //! \brief Add a RaggedSoftMax layer to the network. @@ -4559,9 +4415,12 @@ class INetworkDefinition //! //! \see IRaggedSoftMaxLayer //! + //! \warning The bounds tensor cannot have the last dimension be the wildcard character. + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new RaggedSoftMax layer, or nullptr if it could not be created. //! - virtual IRaggedSoftMaxLayer* addRaggedSoftMax(ITensor& input, ITensor& bounds) = 0; + virtual IRaggedSoftMaxLayer* addRaggedSoftMax(ITensor& input, ITensor& bounds) TRTNOEXCEPT = 0; //! //! \brief Add a MatrixMultiply layer to the network. @@ -4573,9 +4432,12 @@ class INetworkDefinition //! //! \see IMatrixMultiplyLayer //! + //! \warning Int32 tensors are not valid input tensors. + //! //! \return The new matrix multiply layer, or nullptr if it could not be created. //! - virtual IMatrixMultiplyLayer* addMatrixMultiply(ITensor& input0, MatrixOperation op0, ITensor& input1, MatrixOperation op1) = 0; + virtual IMatrixMultiplyLayer* addMatrixMultiply( + ITensor& input0, MatrixOperation op0, ITensor& input1, MatrixOperation op1) TRTNOEXCEPT = 0; //! //! \brief Add a MatrixMultiply layer to the network. @@ -4589,9 +4451,12 @@ class INetworkDefinition //! //! \return The new matrix multiply layer, or nullptr if it could not be created. //! + //! \warning Int32 tensors are not valid input tensors. + //! //! \deprecated This interface is superseded by the overload that replaces bool with MatrixOperation. //! - virtual IMatrixMultiplyLayer* addMatrixMultiply(ITensor& input0, bool transpose0, ITensor& input1, bool transpose1) = 0; + TRT_DEPRECATED virtual IMatrixMultiplyLayer* addMatrixMultiply( + ITensor& input0, bool transpose0, ITensor& input1, bool transpose1) TRTNOEXCEPT = 0; //! //! \brief Add a constant layer to the network. @@ -4604,10 +4469,16 @@ class INetworkDefinition //! \return The new constant layer, or nullptr if it could not be created. //! //! If weights.type is DataType::kINT32, the output is a tensor of 32-bit indices. - //! Otherwise the output is a tensor of real values, and the output type will be - //! FP32, FP16, or quantized INT8 following TensorRT's normal precision rules. + //! 
Otherwise the output is a tensor of real values and the output type will be + //! follow TensorRT's normal precision rules. + //! + //! If tensors in the network have an implicit batch dimension, the constant + //! is broadcast over that dimension. //! - virtual IConstantLayer* addConstant(Dims dimensions, Weights weights) = 0; + //! If a wildcard dimension is used, the volume of the runtime dimensions must equal + //! the number of weights specified. + //! + virtual IConstantLayer* addConstant(Dims dimensions, Weights weights) TRTNOEXCEPT = 0; //! //! \brief Add an \p layerCount deep RNN layer to the network with \p hiddenSize internal states that can @@ -4656,627 +4527,858 @@ class INetworkDefinition //! of the RNN across all layers. Both the second and third output tensors have dimensions //! `{N1, ..., Np, L, H}`: //! - //! - `N1..Np` are the index dimensions specified by the input tensor - //! - `L` is the number of layers in the RNN, equal to getLayerCount() if getDirection is ::kUNIDIRECTION, - //! and 2x getLayerCount() if getDirection is ::kBIDIRECTION. In the bi-directional - //! case, layer `l`'s final forward hidden state is stored in `L = 2*l`, and - //! final backward hidden state is stored in `L= 2*l + 1`. - //! - `H` is the hidden state for each layer, equal to getHiddenSize(). + //! - `N1..Np` are the index dimensions specified by the input tensor + //! - `L` is the number of layers in the RNN, equal to getLayerCount() if getDirection is ::kUNIDIRECTION, + //! and 2x getLayerCount() if getDirection is ::kBIDIRECTION. In the bi-directional + //! case, layer `l`'s final forward hidden state is stored in `L = 2*l`, and + //! final backward hidden state is stored in `L= 2*l + 1`. + //! - `H` is the hidden state for each layer, equal to getHiddenSize(). + //! + //! \see IRNNv2Layer + //! + //! \warning RNN inputs do not support wildcard dimensions or explicit batch size networks. + //! \warning Int32 tensors are not valid input tensors. + //! + //! \return The new RNN layer, or nullptr if it could not be created. + //! + virtual IRNNv2Layer* addRNNv2( + ITensor& input, int32_t layerCount, int32_t hiddenSize, int32_t maxSeqLen, RNNOperation op) TRTNOEXCEPT = 0; + + //! + //! \brief Add a plugin layer to the network using an IPluginExt interface. + //! + //! \param inputs The input tensors to the layer. + //! \param nbInputs The number of input tensors. + //! \param plugin The layer plugin. + //! + //! \see IPluginLayer + //! + //! \deprecated IPluginLayer is superseded by IPluginV2. use addPluginV2 instead. + //! + //! \warning Plugin inputs do not support wildcard dimensions or explicit batch size networks. + //! \warning Int32 tensors are not valid input tensors. + //! + //! \return The new plugin layer, or nullptr if it could not be created. + //! + TRT_DEPRECATED virtual IPluginLayer* addPluginExt( + ITensor* const* inputs, int nbInputs, IPluginExt& plugin) TRTNOEXCEPT = 0; + + //! + //! \brief Add an identity layer. + //! + //! \param input The input tensor to the layer. + //! + //! \see IIdentityLayer + //! + //! \warning Int32 tensors are not valid input tensors. + //! + //! \return The new identity layer, or nullptr if it could not be created. + //! + virtual IIdentityLayer* addIdentity(ITensor& input) TRTNOEXCEPT = 0; + + //! + //! \brief remove a tensor from the network definition. + //! + //! \param tensor the tensor to remove + //! + //! It is illegal to remove a tensor that is the input or output of a layer. + //! 
if this method is called with such a tensor, a warning will be emitted on the log + //! and the call will be ignored. Its intended use is to remove detached tensors after + //! e.g. concatenating two networks with Layer::setInput(). + //! + virtual void removeTensor(ITensor& tensor) TRTNOEXCEPT = 0; + + //! + //! \brief unmark a tensor as a network output. + //! + //! \param tensor The tensor to unmark as an output tensor. + //! + //! see markOutput() + //! + virtual void unmarkOutput(ITensor& tensor) TRTNOEXCEPT = 0; + + //! + //! \brief Add a plugin layer to the network using the IPluginV2 interface. + //! + //! \param inputs The input tensors to the layer. + //! \param nbInputs The number of input tensors. + //! \param plugin The layer plugin. + //! + //! \see IPluginV2Layer + //! + //! \warning Dimension wildcard are only supported with IPluginV2DynamicExt or IPluginV2IOExt plugins. + //! \warning Int32 tensors are not valid input tensors. + //! + //! \return The new plugin layer, or nullptr if it could not be created. + //! + virtual IPluginV2Layer* addPluginV2(ITensor* const* inputs, int nbInputs, IPluginV2& plugin) TRTNOEXCEPT = 0; + + //! + //! \brief Add a slice layer to the network. + //! + //! \param input The input tensor to the layer. + //! \param start The start offset + //! \param size The output dimension + //! \param stride The slicing stride + //! + //! Positive, negative, zero stride values, and combinations of them in different dimensions are allowed. + //! + //! \see ISliceLayer + //! + //! \return The new slice layer, or nullptr if it could not be created. + //! + virtual ISliceLayer* addSlice(ITensor& input, Dims start, Dims size, Dims stride) TRTNOEXCEPT = 0; + + //! + //! \brief Sets the name of the network. + //! + //! \param name The name to assign to this network. + //! + //! Set the name of the network so that it can be associated with a built + //! engine. The \p name must be a zero delimited C-style string of length + //! no greater than 128 characters. TensorRT makes no use of this string + //! except storing it as part of the engine so that it may be retrieved at + //! runtime. A name unique to the builder will be generated by default. + //! + //! This method copies the name string. + //! + //! \see INetworkDefinition::getName(), ISafeCudaEngine::getName() + //! + //! \return none + //! + virtual void setName(const char* name) TRTNOEXCEPT = 0; + + //! + //! \brief Returns the name associated with the network. + //! + //! The memory pointed to by getName() is owned by the INetworkDefinition object. + //! + //! \see INetworkDefinition::setName() + //! + //! \return A zero delimited C-style string representing the name of the network. + //! + virtual const char* getName() const TRTNOEXCEPT = 0; + + //! + //! \brief Add a shape layer to the network. + //! + //! \param input The input tensor to the layer. + //! + //! \see IShapeLayer + //! + //! \warning addShape is only supported when hasImplicitBatchDimensions is false. + //! + //! \warning input to addShape cannot contain wildcard dimension values. + //! + //! \return The new shape layer, or nullptr if it could not be created. + //! + virtual IShapeLayer* addShape(ITensor& input) TRTNOEXCEPT = 0; + + //! + //! \brief True if tensors have implicit batch dimension. + //! + //! \return True if tensors have implicit batch dimension, false otherwise. + //! + //! This is a network-wide property. Either all tensors in the network + //! have an implicit batch dimension or none of them do. + //! + //! 
hasImplicitBatchDimension() is true if and only if this INetworkDefinition + //! was created with createNetwork() or createNetworkV2() without + //! NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. + //! + //! \see createNetworkV2 + //! + virtual bool hasImplicitBatchDimension() const TRTNOEXCEPT = 0; + + //! + //! \brief Enable tensor's value to be computed by IExecutionContext::getShapeBinding. + //! + //! \return True if successful, false if tensor is already marked as an output. + //! + //! The tensor must be of type DataType::kINT32 and have no more than one dimension. + //! + //! \warning The tensor must have dimensions that can be determined to be constants at build time. + //! + //! \warning It is an error to mark a network input as a shape output. + //! + //! \see isShapeBinding(), getShapeBinding() + //! + virtual bool markOutputForShapes(ITensor& tensor) TRTNOEXCEPT = 0; + + //! + //! \brief Undo markOutputForShapes. + //! + //! \warning inputs to addShape cannot contain wildcard dimension values. + //! + //! \return True if successful, false if tensor is not marked as an output. + //! + virtual bool unmarkOutputForShapes(ITensor& tensor) TRTNOEXCEPT = 0; + + //! + //! \brief Add a parametric ReLU layer to the network. + //! + //! \param input The input tensor to the layer. + //! \param slope The slope tensor to the layer. This tensor should be unidirectionally broadcastable + //! to the input tensor. + //! + //! \see IParametricReLULayer //! - //! \see IRNNv2Layer + //! \warning Int32 tensors are not valid input tensors. //! - //! \return The new RNN layer, or nullptr if it could not be created. + //! \return The new parametric ReLU layer, or nullptr if it could not be created. //! - virtual IRNNv2Layer* addRNNv2(ITensor& input, int32_t layerCount, int32_t hiddenSize, int32_t maxSeqLen, RNNOperation op) = 0; + virtual IParametricReLULayer* addParametricReLU(ITensor& input, ITensor& slope) noexcept = 0; //! - //! \brief Add a plugin layer to the network using an IPluginExt interface. + //! \brief Add a multi-dimension convolution layer to the network. //! - //! \param inputs The input tensors to the layer. - //! \param nbInputs The number of input tensors. - //! \param plugin The layer plugin. + //! \param input The input tensor to the convolution. + //! \param nbOutputMaps The number of output feature maps for the convolution. + //! \param kernelSize The multi-dimensions of the convolution kernel. + //! \param kernelWeights The kernel weights for the convolution. + //! \param biasWeights The optional bias weights for the convolution. //! - //! \see IPluginLayer + //! \see IConvolutionLayer //! - //! \return The new plugin layer, or nullptr if it could not be created. + //! \warning It is an error to specify a wildcard value for the 'C' dimension of the input tensor. + //! \warning Int32 tensors are not valid input tensors. + //! \warning Only 2D or 3D convolution is supported. + //! + //! \return The new convolution layer, or nullptr if it could not be created. //! - virtual IPluginLayer* addPluginExt(ITensor* const* inputs, int nbInputs, IPluginExt& plugin) = 0; + virtual IConvolutionLayer* addConvolutionNd( + ITensor& input, int nbOutputMaps, Dims kernelSize, Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! - //! \brief Add an identity layer. + //! \brief Add a multi-dimension pooling layer to the network. //! //! \param input The input tensor to the layer. + //! \param type The type of pooling to apply. + //! 
\param windowSize The size of the pooling window. //! - //! \see IIdentityLayer + //! \see IPoolingLayer PoolingType //! - //! \return The new identity layer, or nullptr if it could not be created. + //! \warning Int32 tensors are not valid input tensors. + //! \warning Only 2D or 3D pooling is supported. + //! + //! \return The new pooling layer, or nullptr if it could not be created. //! - virtual IIdentityLayer* addIdentity(ITensor& input) = 0; + virtual IPoolingLayer* addPoolingNd(ITensor& input, PoolingType type, Dims windowSize) TRTNOEXCEPT = 0; //! - //! \brief remove a tensor from the network definition. + //! \brief Add a multi-dimension deconvolution layer to the network. //! - //! \param tensor the tensor to remove + //! \param input The input tensor to the layer. + //! \param nbOutputMaps The number of output feature maps. + //! \param kernelSize The multi-dimensions of the deconvolution kernel. + //! \param kernelWeights The kernel weights for the deconvolution. + //! \param biasWeights The optional bias weights for the deconvolution. //! - //! It is illegal to remove a tensor that is the input or output of a layer. - //! if this method is called with such a tensor, a warning will be emitted on the log - //! and the call will be ignored. Its intended use is to remove detached tensors after - //! e.g. concatenating two networks with Layer::setInput(). + //! \see IDeconvolutionLayer + //! + //! \warning It is an error to specify a wildcard value for the 'C' dimension of the input tensor. + //! \warning Int32 tensors are not valid input tensors. + //! \warning Only 2D or 3D deconvolution is supported. + // + //! \return The new deconvolution layer, or nullptr if it could not be created. //! - virtual void removeTensor(ITensor& tensor) = 0; + virtual IDeconvolutionLayer* addDeconvolutionNd( + ITensor& input, int nbOutputMaps, Dims kernelSize, Weights kernelWeights, Weights biasWeights) TRTNOEXCEPT = 0; //! - //! \brief unmark a tensor as a network output. + //! \brief Add a multi-dimension scale layer to the network. //! - //! \param tensor The tensor to unmark as an output tensor. + //! \param input The input tensor to the layer. + //! \param mode The scaling mode. + //! \param shift The shift value. + //! \param scale The scale value. + //! \param power The power value. + //! \param channelAxis The channel axis. //! - //! see markOutput() + //! If the weights are available, then the size of weights are dependent on the ScaleMode. + //! For ::kUNIFORM, the number of weights is equal to 1. + //! For ::kCHANNEL, the number of weights is equal to the channel dimension. + //! For ::kELEMENTWISE, the number of weights is equal to the volume of the input. + //! + //! \see IScaleLayer + //! \warning Int32 tensors are not valid input tensors. + //! \warning Only 2D or 3D scale is supported. + //! + //! \return The new Scale layer, or nullptr if it could not be created. //! - virtual void unmarkOutput(ITensor& tensor) = 0; + virtual IScaleLayer* addScaleNd(ITensor& input, ScaleMode mode, Weights shift, Weights scale, Weights power, int channelAxis) TRTNOEXCEPT = 0; + //! \brief Add a resize layer to the network. //! - //! \brief Add a plugin layer to the network using the IPluginV2 interface. + //! \param input The input tensor to the layer. //! - //! \param inputs The input tensors to the layer. - //! \param nbInputs The number of input tensors. - //! \param plugin The layer plugin. + //! \see IResizeLayer //! - //! \see IPluginV2Layer + //! 
\warning Int32 tensors are not valid input tensors. //! - //! \return The new plugin layer, or nullptr if it could not be created. + //! \return The new resize layer, or nullptr if it could not be created. //! - virtual IPluginV2Layer* addPluginV2(ITensor* const* inputs, int nbInputs, IPluginV2& plugin) = 0; + virtual IResizeLayer* addResize(ITensor& input) TRTNOEXCEPT = 0; //! - //! \brief Add a slice layer to the network. - //! - //! \param input The input tensor to the layer. - //! \param start The start offset - //! \param size The output dimension - //! \param stride The slicing stride + //! \brief True if network is an explicit precision network //! - //! Positive, negative, zero stride values, and combinations of them in different dimensions are allowed. + //! hasExplicitPrecision() is true if and only if this INetworkDefinition + //! was created with createNetworkV2() with NetworkDefinitionCreationFlag::kEXPLICIT_PRECISION set. //! - //! \see ISliceLayer + //! \see createNetworkV2 //! - //! \return The new slice layer, or nullptr if it could not be created. + //! \return True if network has explicit precision, false otherwise. //! - virtual ISliceLayer* addSlice(ITensor& input, Dims start, Dims size, Dims stride) = 0; + virtual bool hasExplicitPrecision() const TRTNOEXCEPT = 0; }; //! -//! \class IProfiler -//! -//! \brief Application-implemented interface for profiling. -//! -//! When this class is added to an execution context, the profiler will be called once per layer for each invocation of execute(). -//! Note that enqueue() does not currently support profiling. +//! enum CalibrationAlgoType //! -//! The profiler will only be called after execution is complete. It has a small impact on execution time. +//! \brief Version of calibration algorithm to use. //! -class IProfiler +enum class CalibrationAlgoType : int { -public: - //! - //! \brief Layer time reporting callback. - //! - //! \param layerName The name of the layer, set when constructing the network definition. - //! \param ms The time in milliseconds to execute the layer. - //! - virtual void reportLayerTime(const char* layerName, float ms) = 0; - - virtual ~IProfiler() {} + kLEGACY_CALIBRATION = 0, + kENTROPY_CALIBRATION = 1, + kENTROPY_CALIBRATION_2 = 2, + kMINMAX_CALIBRATION = 3, }; -class ICudaEngine; +template <> +constexpr inline int EnumMax() +{ + return 4; +} //!< Maximum number of elements in CalibrationAlgoType enum. \see DataType //! -//! \class IExecutionContext +//! \class IInt8Calibrator //! -//! \brief Context for executing inference using an engine. +//! \brief Application-implemented interface for calibration. //! -//! Multiple execution contexts may exist for one ICudaEngine instance, allowing the same -//! engine to be used for the execution of multiple batches simultaneously. +//! Calibration is a step performed by the builder when deciding suitable scale factors for 8-bit inference. //! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! It must also provide a method for retrieving representative images which the calibration process can use to examine +//! the distribution of activations. It may optionally implement a method for caching the calibration result for reuse +//! on subsequent runs. //! -class IExecutionContext +class IInt8Calibrator { public: //! - //! \brief Synchronously execute inference on a batch. - //! - //! This method requires an array of input and output buffers. 
The mapping from tensor names to indices can be queried using ICudaEngine::getBindingIndex() - //! \param batchSize The batch size. This is at most the value supplied when the engine was built. - //! \param bindings An array of pointers to input and output buffers for the network. - //! - //! \return True if execution succeeded. + //! \brief Get the batch size used for calibration batches. //! - //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize() + //! \return The batch size. //! - virtual bool execute(int batchSize, void** bindings) = 0; + virtual int getBatchSize() const TRTNOEXCEPT = 0; //! - //! \brief Asynchronously execute inference on a batch. + //! \brief Get a batch of input for calibration. //! - //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be queried using ICudaEngine::getBindingIndex() - //! \param batchSize The batch size. This is at most the value supplied when the engine was built. - //! \param bindings An array of pointers to input and output buffers for the network. - //! \param stream A cuda stream on which the inference kernels will be enqueued - //! \param inputConsumed An optional event which will be signaled when the input buffers can be refilled with new data + //! The batch size of the input must match the batch size returned by getBatchSize(). //! - //! \return True if the kernels were enqueued successfully. + //! \param bindings An array of pointers to device memory that must be updated to point to device memory + //! containing each network input data. + //! \param names The names of the network input for each pointer in the binding array. + //! \param nbBindings The number of pointers in the bindings array. + //! \return False if there are no more batches for calibration. //! - //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize() + //! \see getBatchSize() //! - virtual bool enqueue(int batchSize, void** bindings, cudaStream_t stream, cudaEvent_t* inputConsumed) = 0; + virtual bool getBatch(void* bindings[], const char* names[], int nbBindings) TRTNOEXCEPT = 0; //! - //! \brief Set the debug sync flag. + //! \brief Load a calibration cache. + //! + //! Calibration is potentially expensive, so it can be useful to generate the calibration data once, then use it on + //! subsequent builds of the network. The cache includes the regression cutoff and quantile values used to generate + //! it, and will not be used if these do not match the settings of the current calibrator. However, the network + //! should also be recalibrated if its structure changes, or the input data set changes, and it is the + //! responsibility of the application to ensure this. //! - //! If this flag is set to true, the engine will log the successful execution for each kernel during execute(). It has no effect when using enqueue(). + //! \param length The length of the cached data, that should be set by the called function. If there is no data, + //! this should be zero. //! - //! \see getDebugSync() + //! \return A pointer to the cache, or nullptr if there is no data. //! - virtual void setDebugSync(bool sync) = 0; + virtual const void* readCalibrationCache(std::size_t& length) TRTNOEXCEPT = 0; //! - //! \brief Get the debug sync flag. + //! \brief Save a calibration cache. //! - //! \see setDebugSync() + //! \param ptr A pointer to the data to cache. + //! \param length The length in bytes of the data to cache. //! - virtual bool getDebugSync() const = 0; + //! 
\see readCalibrationCache() + //! + virtual void writeCalibrationCache(const void* ptr, std::size_t length) TRTNOEXCEPT = 0; //! - //! \brief Set the profiler. + //! \brief Get the algorithm used by this calibrator. //! - //! \see IProfiler getProfiler() + //! \return The algorithm used by the calibrator. //! - virtual void setProfiler(IProfiler*) = 0; + virtual CalibrationAlgoType getAlgorithm() TRTNOEXCEPT = 0; + virtual ~IInt8Calibrator() {} +}; + +//! +//! Entropy calibrator. This is the Legacy Entropy calibrator. It is less complicated than the legacy calibrator and +//! produces better results. +//! +class IInt8EntropyCalibrator : public IInt8Calibrator +{ +public: //! - //! \brief Get the profiler. - //! - //! \see IProfiler setProfiler() + //! Signal that this is the entropy calibrator. //! - virtual IProfiler* getProfiler() const = 0; + CalibrationAlgoType getAlgorithm() TRTNOEXCEPT override { return CalibrationAlgoType::kENTROPY_CALIBRATION; } + virtual ~IInt8EntropyCalibrator() {} +}; + +//! +//! Entropy calibrator 2. This is the preferred calibrator. This is the required calibrator for DLA, as it supports per +//! activation tensor scaling. +//! +class IInt8EntropyCalibrator2 : public IInt8Calibrator +{ +public: //! - //! \brief Get the associated engine. - //! - //! \see ICudaEngine + //! Signal that this is the entropy calibrator 2. //! - virtual const ICudaEngine& getEngine() const = 0; + CalibrationAlgoType getAlgorithm() TRTNOEXCEPT override { return CalibrationAlgoType::kENTROPY_CALIBRATION_2; } + virtual ~IInt8EntropyCalibrator2() {} +}; + +//! +//! MinMax Calibrator. This is the preferred calibrator for NLP tasks. It supports per +//! activation tensor scaling. +//! +class IInt8MinMaxCalibrator : public IInt8Calibrator +{ +public: //! - //! \brief Destroy this object. + //! Signal that this is the MinMax Calibrator. //! - virtual void destroy() = 0; + CalibrationAlgoType getAlgorithm() TRTNOEXCEPT override { return CalibrationAlgoType::kMINMAX_CALIBRATION; } -protected: - virtual ~IExecutionContext() {} + virtual ~IInt8MinMaxCalibrator() {} +}; +//! +//! \deprecated Legacy calibrator left for backward compatibility with TensorRT 2.0. +//! +class TRT_DEPRECATED IInt8LegacyCalibrator : public IInt8Calibrator +{ public: //! - //! \brief Set the name of the execution context. + //! Signal that this is the legacy calibrator. //! - //! This method copies the name string. + CalibrationAlgoType getAlgorithm() TRTNOEXCEPT override { return CalibrationAlgoType::kLEGACY_CALIBRATION; } + //! - //! \see getName() + //! \brief The quantile (between 0 and 1) that will be used to select the region maximum when the quantile method + //! is in use. + //! + //! See the user guide for more details on how the quantile is used. //! - virtual void setName(const char* name) = 0; + virtual double getQuantile() const TRTNOEXCEPT = 0; //! - //! \brief Return the name of the execution context. + //! \brief The fraction (between 0 and 1) of the maximum used to define the regression cutoff when using regression + //! to determine the region maximum. //! - //! \see setName() + //! See the user guide for more details on how the regression cutoff is used + //! + virtual double getRegressionCutoff() const TRTNOEXCEPT = 0; + + //! + //! \brief Load a histogram. + //! + //! Histogram generation is potentially expensive, so it can be useful to generate the histograms once, then use + //! them when exploring the space of calibrations. The histograms should be regenerated if the network structure + //! 
changes, or the input data set changes, and it is the responsibility of the application to ensure this. + //! + //! \param length The length of the cached data, that should be set by the called function. If there is no data, + //! this should be zero. //! - virtual const char* getName() const = 0; + //! \return A pointer to the cache, or nullptr if there is no data. + //! + virtual const void* readHistogramCache(std::size_t& length) TRTNOEXCEPT = 0; //! - //! \brief set the device memory for use by this execution context. + //! \brief Save a histogram cache. //! - //! The memory must be aligned with cuda memory alignment property (using cudaGetDeviceProperties()), and its size must be at least that - //! returned by getDeviceMemorySize(). If using enqueue() to run the network, The memory is in - //! use from the invocation of enqueue() until network execution is complete. If using execute(), - //! it is in use until execute() returns. Releasing or otherwise using the memory for other - //! purposes during this time will result in undefined behavior. + //! \param ptr A pointer to the data to cache. + //! \param length The length in bytes of the data to cache. //! - //! \see ICudaEngine::getDeviceMemorySize() ICudaEngine::createExecutionContextWithoutDeviceMemory() + //! \see readHistogramCache() //! - virtual void setDeviceMemory(void* memory) = 0; + virtual void writeHistogramCache(const void* ptr, std::size_t length) TRTNOEXCEPT = 0; + + virtual ~IInt8LegacyCalibrator() {} }; //! -//! \class ICudaEngine +//! \brief It is capable of representing one or more BuilderFlags by binary OR +//! operations, e.g., 1U << BuilderFlag::kFP16 | 1U << BuilderFlag::kDEBUG. //! -//! \brief An engine for executing inference on a built network. +//! \see IBuilderConfig::getFlags(), ITensor::setFlags(), //! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +typedef uint32_t BuilderFlags; + +//! +//! \enum BuilderFlags +//! +//! \brief List of valid modes that the builder can enable when creating an engine from a network definition. //! -class ICudaEngine +//! \see IBuilderConfig::setFlag(), IBuilderConfig::getFlag() +//! +enum class BuilderFlag : int +{ + kFP16 = 0, //!< Enable FP16 layer selection. + kINT8 = 1, //!< Enable Int8 layer selection. + kDEBUG = 2, //!< Enable debugging of layers via synchronizing after every layer. + kGPU_FALLBACK = 3, //!< Enable layers marked to execute on GPU if layer cannot execute on DLA. + kSTRICT_TYPES = 4, //!< Enables strict type constraints. + kREFIT = 5, //!< Enable building a refittable engine. +}; + +template <> +constexpr inline int EnumMax() +{ + return 6; +} //!< Maximum number of builder flags in BuilderFlag enum. \see BuilderFlag + +//! +//! \class IBuilderConfig +//! +//! \brief Holds properties for configuring a builder to produce an engine. \see BuilderFlags +//! +class IBuilderConfig { public: //! - //! \brief Get the number of binding indices. - //! - //! \see getBindingIndex(); + //! \brief Set the number of minimization iterations used when timing layers. //! - virtual int getNbBindings() const = 0; - + //! When timing layers, the builder minimizes over a set of average times for layer execution. This parameter + //! controls the number of iterations used in minimization. The builder may sometimes run layers for more + //! iterations to improve timing accuracy if this parameter is set to a small value and the runtime of the + //! layer is short. //! - //! 
\brief Retrieve the binding index for a named tensor. + //! \see getMinTimingIterations() //! - //! IExecutionContext::enqueue() and IExecutionContext::execute() require an array of buffers. + virtual void setMinTimingIterations(int minTiming) TRTNOEXCEPT = 0; + //! - //! Engine bindings map from tensor names to indices in this array. - //! Binding indices are assigned at engine build time, and take values in the range [0 ... n-1] where n is the total number of inputs and outputs. + //! \brief Query the number of minimization iterations. //! - //! \param name The tensor name. - //! \return The binding index for the named tensor, or -1 if the name is not found. + //! By default the minimum number of iterations is 2. //! - //! see getNbBindings() getBindingIndex() + //! \see setMinTimingIterations() //! - virtual int getBindingIndex(const char* name) const = 0; + virtual int getMinTimingIterations() const TRTNOEXCEPT = 0; //! - //! \brief Retrieve the name corresponding to a binding index. - //! - //! This is the reverse mapping to that provided by getBindingIndex(). + //! \brief Set the number of averaging iterations used when timing layers. //! - //! \param bindingIndex The binding index. - //! \return The name corresponding to the index, or nullptr if the index is out of range. + //! When timing layers, the builder minimizes over a set of average times for layer execution. This parameter + //! controls the number of iterations used in averaging. //! - //! \see getBindingIndex() + //! \see getAvgTimingIterations() //! - virtual const char* getBindingName(int bindingIndex) const = 0; + virtual void setAvgTimingIterations(int avgTiming) TRTNOEXCEPT = 0; //! - //! \brief Determine whether a binding is an input binding. + //! \brief Query the number of averaging iterations. //! - //! \param bindingIndex The binding index. - //! \return True if the index corresponds to an input binding and the index is in range. + //! By default the number of averaging iterations is 1. //! - //! \see getBindingIndex() + //! \see setAvgTimingIterations() //! - virtual bool bindingIsInput(int bindingIndex) const = 0; + virtual int getAvgTimingIterations() const TRTNOEXCEPT = 0; //! - //! \brief Get the dimensions of a binding. + //! \brief Configure the builder to target specified EngineCapability flow. //! - //! \param bindingIndex The binding index. - //! \return The dimensions of the binding if the index is in range, otherwise (0,0,0). + //! The flow means a sequence of API calls that allow an application to set up a runtime, engine, + //! and execution context in order to run inference. //! - //! \see getBindingIndex() + //! The supported flows are specified in the EngineCapability enum. //! - virtual Dims getBindingDimensions(int bindingIndex) const = 0; + virtual void setEngineCapability(EngineCapability capability) TRTNOEXCEPT = 0; //! - //! \brief Determine the required data type for a buffer from its binding index. + //! \brief Query EngineCapability flow configured for the builder. //! - //! \param bindingIndex The binding index. - //! \return The type of the data in the buffer. + //! By default it returns EngineCapability::kDEFAULT. //! - //! \see getBindingIndex() + //! \see setEngineCapability() //! - virtual DataType getBindingDataType(int bindingIndex) const = 0; + virtual EngineCapability getEngineCapability() const TRTNOEXCEPT = 0; //! - //! \brief Get the maximum batch size which can be used for inference. + //! \brief Set Int8 Calibration interface. //! - //! 
\return The maximum batch size for this engine. + //! The calibrator is to minimize the information loss during the INT8 quantization process. //! - virtual int getMaxBatchSize() const = 0; + virtual void setInt8Calibrator(IInt8Calibrator* calibrator) TRTNOEXCEPT = 0; //! - //! \brief Get the number of layers in the network. - //! - //! The number of layers in the network is not necessarily the number in the original network definition, as layers may be combined or eliminated as the engine is - //! optimized. This value can be useful when building per-layer tables, such as when aggregating profiling data over a number of executions. - //! - //! \return The number of layers in the network. + //! \brief Get Int8 Calibration interface. //! - virtual int getNbLayers() const = 0; + virtual IInt8Calibrator* getInt8Calibrator() const TRTNOEXCEPT = 0; //! - //! \brief Get the amount of workspace the engine uses. + //! \brief Set the maximum workspace size. + //! + //! \param workspaceSize The maximum GPU temporary memory which the engine can use at execution time. //! - //! The workspace size will be no greater than the value provided to the builder when the engine was built, and will typically be smaller. - //! Workspace will be allocated for each execution context. + //! \see getMaxWorkspaceSize() //! - virtual std::size_t getWorkspaceSize() const = 0; + virtual void setMaxWorkspaceSize(std::size_t workspaceSize) TRTNOEXCEPT = 0; //! - //! \brief Serialize the network to a stream. + //! \brief Get the maximum workspace size. //! - //! \return A IHostMemory object that contains the serialized engine. + //! By default the workspace size is 0, which means there is no temporary memory. //! - //! The network may be deserialized with IRuntime::deserializeCudaEngine() + //! \return The maximum workspace size. //! - //! \see IRuntime::deserializeCudaEngine() + //! \see setMaxWorkspaceSize() //! - virtual IHostMemory* serialize() const = 0; + virtual std::size_t getMaxWorkspaceSize() const TRTNOEXCEPT = 0; //! - //! \brief Create an execution context. + //! \brief Set the build mode flags to turn on builder options for this network. //! - //! \see IExecutionContext. + //! The flags are listed in the BuilderFlags enum. + //! The flags set configuration options to build the network. //! - virtual IExecutionContext* createExecutionContext() = 0; - + //! \param builderFlags The build option for an engine. //! - //! \brief Destroy this object; + //! \note This function will override the previous set flags, rather than bitwise adding the new flag. //! - virtual void destroy() = 0; - + //! \see getFlags() //! - //! \brief Get location of binding + virtual void setFlags(BuilderFlags builderFlags) TRTNOEXCEPT = 0; + //! - //! This lets you know whether the binding should be a pointer to device or host memory. + //! \brief Get the set of build mode flags for this builder config. Defaults to BuildMode::kDEFAULT. //! - //! \see ITensor::setLocation() ITensor::getLocation() + //! \return The build options as a bitmask. //! - //! \param bindingIndex The binding index. - //! \return The location of the bound tensor with given index. + //! \see setFlags() //! - virtual TensorLocation getLocation(int bindingIndex) const = 0; - -protected: - virtual ~ICudaEngine() {} + virtual BuilderFlags getFlags() const TRTNOEXCEPT = 0; -public: //! - //! \brief create an execution context without any device memory allocated + //! \brief clear a single build mode flag. //! - //! 
The memory for execution of this device context must be supplied by the application. + //! clears the builder flag from the set of enabled flags. //! - //! \see getDeviceMemorySize() IExecutionContext::setDeviceMemory() + //! \see setFlags //! - virtual IExecutionContext* createExecutionContextWithoutDeviceMemory() = 0; + virtual void clearFlag(BuilderFlag builderFlag) TRTNOEXCEPT = 0; //! - //! \brief Return the amount of device memory required by an execution context. + //! \brief Set a single build mode flag. //! - //! \see IExecutionContext::setDeviceMemory() + //! Sets the build mode flags on top of the flags already specified. //! - virtual size_t getDeviceMemorySize() const = 0; + //! \see setFlags + //! + virtual void setFlag(BuilderFlag builderFlag) TRTNOEXCEPT = 0; //! - //! \brief Return true if engine can be refit. + //! \brief returns true if the build mode flag is set //! - //! \see nvinfer1::createInferRefitter() + //! Check if a build mode flag is set. //! - virtual bool isRefittable() const = 0; -}; - -//! -//! enum CalibrationAlgoType -//! -//! \brief Version of calibration algorithm to use. -//! -enum class CalibrationAlgoType : int -{ - kLEGACY_CALIBRATION = 0, - kENTROPY_CALIBRATION = 1, - kENTROPY_CALIBRATION_2 = 2 -}; + //! \see getFlags() + //! + //! \return True if flag is set, false if unset. + //! + virtual bool getFlag(BuilderFlag builderFlag) const TRTNOEXCEPT = 0; -template <> -inline int EnumMax() -{ - return 3; -} //!< Maximum number of elements in CalibrationAlgoType enum. \see DataType -//! -//! \class IInt8Calibrator -//! -//! \brief Application-implemented interface for calibration. -//! -//! Calibration is a step performed by the builder when deciding suitable scale factors for 8-bit inference. -//! -//! It must also provide a method for retrieving representative images which the calibration process can use to examine -//! the distribution of activations. It may optionally implement a method for caching the calibration result for reuse -//! on subsequent runs. -//! -class IInt8Calibrator -{ -public: //! - //! \brief Get the batch size used for calibration batches. + //! \brief Set the device that this layer must execute on. + //! \param DeviceType that this layer must execute on. + //! If DeviceType is not set or is reset, TensorRT will use the default DeviceType set in the builder. + //! + //! \note The device type for a layer must be compatible with the safety flow (if specified). + //! For example a layer cannot be marked for DLA execution while the builder is configured for kSAFE_GPU. //! - //! \return The batch size. + //! \see getDeviceType() //! - virtual int getBatchSize() const = 0; + virtual void setDeviceType(const ILayer* layer, DeviceType deviceType) TRTNOEXCEPT = 0; //! - //! \brief Get a batch of input for calibration. - //! - //! The batch size of the input must match the batch size returned by getBatchSize(). - //! - //! \param bindings An array of pointers to device memory that must be updated to point to device memory containing each network input data. - //! \param names The names of the network input for each pointer in the binding array. - //! \param nbBindings The number of pointers in the bindings array. - //! \return False if there are no more batches for calibration. + //! \brief Get the device that this layer executes on. + //! \return Returns DeviceType of the layer. //! + virtual DeviceType getDeviceType(const ILayer* layer) const TRTNOEXCEPT = 0; + //! - //! \see getBatchSize() + //! 
\brief whether the DeviceType has been explicitly set for this layer + //! \return true if device type is not default + //! \see setDeviceType() getDeviceType() resetDeviceType() //! - virtual bool getBatch(void* bindings[], const char* names[], int nbBindings) = 0; // get a pointer to the input batch + virtual bool isDeviceTypeSet(const ILayer* layer) const TRTNOEXCEPT = 0; //! - //! \brief Load a calibration cache. + //! \brief reset the DeviceType for this layer //! - //! Calibration is potentially expensive, so it can be useful to generate the calibration data once, then use it on subsequent builds - //! of the network. The cache includes the regression cutoff and quantile values used to generate it, and will not be used if - //! these do not batch the settings of the current calibrator. However, the network should also be recalibrated if its structure - //! changes, or the input data set changes, and it is the responsibility of the application to ensure this. + //! \see setDeviceType() getDeviceType() isDeviceTypeSet() //! - //! \param length The length of the cached data, that should be set by the called function. If there is no data, this should be zero. + virtual void resetDeviceType(const ILayer* layer) TRTNOEXCEPT = 0; + //! - //! \return A pointer to the cache, or nullptr if there is no data. + //! \brief Checks if a layer can run on DLA. + //! \return status true if the layer can on DLA else returns false. //! - virtual const void* readCalibrationCache(std::size_t& length) = 0; + virtual bool canRunOnDLA(const ILayer* layer) const TRTNOEXCEPT = 0; //! - //! \brief Save a calibration cache. + //! \brief Sets the DLA core used by the network. + //! \param dlaCore The DLA core to execute the engine on (0 to N-1). Default value is 0. //! - //! \param ptr A pointer to the data to cache. - //! \param length The length in bytes of the data to cache. + //! It can be used to specify which DLA core to use via indexing, if multiple DLA cores are available. //! - //! \see readCalibrationCache() + //! \see IRuntime::setDLACore() getDLACore() //! - virtual void writeCalibrationCache(const void* ptr, std::size_t length) = 0; + virtual void setDLACore(int dlaCore) TRTNOEXCEPT = 0; //! - //! \brief Get the algorithm used by this calibrator. - //! - //! \return The algorithm used by the calibrator. + //! \brief Get the DLA core that the engine executes on. + //! \return If setDLACore is called, returns DLA core from 0 to N-1, else returns 0. //! - virtual CalibrationAlgoType getAlgorithm() = 0; - - virtual ~IInt8Calibrator() {} -}; + virtual int getDLACore() const TRTNOEXCEPT = 0; -//! -//! Entropy calibrator. This is the Legacy Entropy calibrator. It is less complicated than the legacy calibrator and produces better results. -//! -class IInt8EntropyCalibrator : public IInt8Calibrator -{ -public: //! - //! Signal that this is the entropy calibrator. + //! \brief Sets the default DeviceType to be used by the builder. It ensures that all the layers that can run on + //! this device will run on it, unless setDeviceType is used to override the default DeviceType for a layer. + //! \see getDefaultDeviceType() //! - virtual CalibrationAlgoType getAlgorithm() { return CalibrationAlgoType::kENTROPY_CALIBRATION; } - - virtual ~IInt8EntropyCalibrator() {} -}; + virtual void setDefaultDeviceType(DeviceType deviceType) TRTNOEXCEPT = 0; -//! -//! Entropy calibrator 2. This is the preferred calibrator. This is the required calibrator for DLA, as it supports per activation tensor scaling. -//! 
-class IInt8EntropyCalibrator2 : public IInt8Calibrator -{ -public: //! - //! Signal that this is the entropy calibrator 2. + //! \brief Get the default DeviceType which was set by setDefaultDeviceType. //! - CalibrationAlgoType getAlgorithm() override { return CalibrationAlgoType::kENTROPY_CALIBRATION_2; } - - virtual ~IInt8EntropyCalibrator2() {} -}; + //! By default it returns DeviceType::kGPU. + //! + virtual DeviceType getDefaultDeviceType() const TRTNOEXCEPT = 0; -//! -//! Legacy calibrator for compatibility with 2.0 EA. Will be removed in 2.2. -//! \deprecated -//! -class IInt8LegacyCalibrator : public IInt8Calibrator -{ -public: //! - //! Signal that this is the legacy calibrator. + //! \brief Resets the builder configuration to defaults. //! - virtual CalibrationAlgoType getAlgorithm() { return CalibrationAlgoType::kLEGACY_CALIBRATION; } + //! When initializing a builder config object, we can call this function. + //! + virtual void reset() TRTNOEXCEPT = 0; //! - //! \brief The quantile (between 0 and 1) that will be used to select the region maximum when the quantile method is in use. + //! \brief De-allocates any internally allocated memory. //! - //! See the user guide for more details on how the quantile is used. + //! When destroying a builder config object, we can call this function. //! - virtual double getQuantile() const = 0; + virtual void destroy() TRTNOEXCEPT = 0; //! - //! \brief The fraction (between 0 and 1) of the maximum used to define the regression cutoff when using regression to determine the region maximum. + //! \brief Set the cudaStream that is used to profile this network. //! - //! See the user guide for more details on how the regression cutoff is used + //! \param stream The cuda stream used for profiling by the builder. //! - virtual double getRegressionCutoff() const = 0; - + //! \see getProfileStream() //! - //! \brief Load a histogram. + virtual void setProfileStream(const cudaStream_t stream) TRTNOEXCEPT = 0; + //! - //! Histogram generation is potentially expensive, so it can be useful to generate the histograms once, then use them when exploring - //! the space of calibrations. The histograms should be regenerated if the network structure - //! changes, or the input data set changes, and it is the responsibility of the application to ensure this. + //! \brief Get the cudaStream that is used to profile this network. //! - //! \param length The length of the cached data, that should be set by the called function. If there is no data, this should be zero. + //! \return The cuda stream used for profiling by the builder. //! - //! \return A pointer to the cache, or nullptr if there is no data. + //! \see setProfileStream() //! - virtual const void* readHistogramCache(std::size_t& length) = 0; + virtual cudaStream_t getProfileStream() const TRTNOEXCEPT = 0; //! - //! \brief Save a histogram cache. + //! \brief Add an optimization profile. //! - //! \param ptr A pointer to the data to cache. - //! \param length The length in bytes of the data to cache. + //! This function must be called at least once if the network has dynamic or shape input tensors. + //! This function may be called at most once when building a refittable engine, as more than + //! a single optimization profile are not supported for refittable engines. //! - //! \see readHistogramCache() + //! \param profile The new optimization profile, which must satisfy profile->isValid() == true + //! 
\return The index of the optimization profile (starting from 0) if the input is valid, or -1 if the input is + //! not valid. //! - virtual void writeHistogramCache(const void* ptr, std::size_t length) = 0; + virtual int addOptimizationProfile(const IOptimizationProfile* profile) noexcept = 0; - virtual ~IInt8LegacyCalibrator() {} + //! + //! \brief Get number of optimization profiles + //! + //! This is one higher than the index of the last optimization profile that has be defined (or + //! zero, if none has been defined yet). + //! + virtual int getNbOptimizationProfiles() const noexcept = 0; + +protected: + virtual ~IBuilderConfig() + { + } }; +//! \typedef NetworkDefinitionCreationFlags //! -//! \enum EngineCapability +//! \brief This bitset is capable of representing one or more NetworkDefinitionCreationFlag flags +//! constructed with binary OR operations. +//! e.g., 1U << NetworkDefinitionCreationFlag::kEXPLICIT_BATCH //! -//! \brief List of supported engine capability flows. +//! \see IBuilder::createNetworkV2 //! -//! \note at present, kSAFE_DLA flow doesn't strictly limit execution to DLA/PVA devices - it simply -//! restricts the engine capabilities to DLA/PVA support levels anticipated in future releases. -//! -enum class EngineCapability -{ - kDEFAULT = 0, //!< Full capability, TensorRT mode without any restrictions. - kSAFE_GPU = 1, //!< Safety restricted capability, TensorRT flow that can only run on GPU devices. - kSAFE_DLA = 2, //!< Safety restricted capability, TensorRT flow that can only run on DLA/PVA devices. -}; - -template <> -inline int EnumMax() -{ - return 3; -} //!< Maximum number of elements in EngineCapability enum. \see EngineCapability +typedef uint32_t NetworkDefinitionCreationFlags; +//! \enum NetworkDefinitionCreationFlag //! -//! \class IGpuAllocator +//! \brief List of immutable network properties expressed at network creation time. +//! NetworkDefinitionCreationFlag is used with createNetworkV2 to specify immutable properties of the network. +//! The createNetwork() function always had an implicit batch dimension being specified by the +//! maxBatchSize builder parameter. createNetworkV2 with kDEFAULT flag mimics that behaviour. //! -//! \brief Application-implemented class for controlling allocation on the GPU. +//! \see IBuilder::createNetworkV2 //! -class IGpuAllocator +enum class NetworkDefinitionCreationFlag : int { -public: - //! - //! A callback implemented by the application to handle acquisition of GPU memory. - //! - //! \param size The size of the memory required. - //! \param alignment The required alignment of memory. Alignment will zero - //! or a power of 2 not exceeding the alignment guaranteed by cudaMalloc. - //! Thus this allocator can be safely implemented with cudaMalloc/cudaFree. - //! An alignment value of zero indicates any alignment is acceptable. - //! \param flags Reserved for future use. In the current release, 0 will be passed. - //! - //! If an allocation request of size 0 is made, nullptr should be returned. - //! - //! If an allocation request cannot be satisfied, nullptr should be returned. - //! - virtual void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) = 0; - - //! - //! A callback implemented by the application to handle release of GPU memory. - //! - //! TensorRT may pass a nullptr to this function if it was previously returned by allocate(). - //! - //! \param memory The acquired memory. - //! - virtual void free(void* memory) = 0; - - //! - //! 
Destructor declared virtual as general good practice for a class with virtual methods. - //! TensorRT never calls the destructor for an IGpuAllocator defined by the application. - //! - virtual ~IGpuAllocator() {} + //! Dynamic shape support requires that the kEXPLICIT_BATCH flag is set. + //! With dynamic shapes, any of the input dimensions can vary at run-time, + //! and there are no implicit dimensions in the network specification. This is specified by using the + //! wildcard dimension value -1. + kEXPLICIT_BATCH = 0, //!< Mark the network to be an explicit batch network + + //! Setting the network to be an explicit precision network has the following implications: + //! 1) Precision of all input tensors to the network have to be specified with ITensor::setType() function + //! 2) Precision of all layer output tensors in the network have to be specified using ILayer::setOutputType() + //! function + //! 3) The builder will not quantize the weights of any layer including those running in lower precision(INT8). It + //! will + //! simply cast the weights into the required precision. + //! 4) Dynamic ranges must not be provided to run the network in int8 mode. Dynamic ranges of each tensor in the + //! explicit + //! precision network is [-127,127]. + //! 5) Quantizing and dequantizing activation values between higher (FP32) and lower (INT8) precision + //! will be performed using explicit Scale layers with input/output precision set appropriately. + kEXPLICIT_PRECISION = 1, //!< Mark the network to be an explicit precision network }; +template <> +constexpr inline int EnumMax() +{ + return 2; +} //! //! \class IBuilder @@ -5289,20 +5391,27 @@ class IBuilder { public: //! - //! \brief Create a network definition object. + //! \brief Create a network definition object where all tensors have an implicit batch dimension. + //! + //! This method is equivalent to createNetworkV2(0U), and retained for + //! compatibility + //! with earlier version of TensorRT. The network does not support dynamic shapes or explicit batch sizes. //! - //! \see INetworkDefinition + //! \see INetworkDefinition, createNetworkV2 //! - virtual nvinfer1::INetworkDefinition* createNetwork() = 0; + //! \deprecated API will be removed in a future release, use IBuilder::createNetworkV2() instead. + //! + TRT_DEPRECATED virtual nvinfer1::INetworkDefinition* createNetwork() TRTNOEXCEPT = 0; //! //! \brief Set the maximum batch size. //! - //! \param batchSize The maximum batch size which can be used at execution time, and also the batch size for which the engine will be optimized. + //! \param batchSize The maximum batch size which can be used at execution time, and also the batch size for which + //! the engine will be optimized. //! //! \see getMaxBatchSize() //! - virtual void setMaxBatchSize(int batchSize) = 0; + virtual void setMaxBatchSize(int batchSize) TRTNOEXCEPT = 0; //! //! \brief Get the maximum batch size. @@ -5312,7 +5421,7 @@ class IBuilder //! \see setMaxBatchSize() //! \see getMaxDLABatchSize() //! - virtual int getMaxBatchSize() const = 0; + virtual int getMaxBatchSize() const TRTNOEXCEPT = 0; //! //! \brief Set the maximum workspace size. @@ -5321,7 +5430,9 @@ class IBuilder //! //! \see getMaxWorkspaceSize() //! - virtual void setMaxWorkspaceSize(std::size_t workspaceSize) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setMaxWorkspaceSize instead. + //! + TRT_DEPRECATED virtual void setMaxWorkspaceSize(std::size_t workspaceSize) TRTNOEXCEPT = 0; //! //! 
\brief Get the maximum workspace size. @@ -5330,118 +5441,147 @@ class IBuilder //! //! \see setMaxWorkspaceSize() //! - virtual std::size_t getMaxWorkspaceSize() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getMaxWorkspaceSize instead. + //! + TRT_DEPRECATED virtual std::size_t getMaxWorkspaceSize() const TRTNOEXCEPT = 0; //! //! \brief Set whether half2 mode is used. //! - //! half2 mode is a paired-image mode that is significantly faster for batch sizes greater than one on platforms with fp16 support. + //! half2 mode is a paired-image mode that is significantly faster for batch sizes greater than one on platforms + //! with fp16 support. //! //! \param mode Whether half2 mode is used. //! //! \see getHalf2Mode() //! - //! \deprecated This function is superseded by setFp16Mode. + //! \deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. //! - virtual void setHalf2Mode(bool mode) = 0; + TRT_DEPRECATED virtual void setHalf2Mode(bool mode) TRTNOEXCEPT = 0; //! //! \brief Query whether half2 mode is used. //! //! \see setHalf2Mode() //! - //! \deprecated This function is superseded by getFp16Mode. + //! \deprecated API will be removed in a future release, use IBuilderConfig::getFlag instead. //! - virtual bool getHalf2Mode() const = 0; + TRT_DEPRECATED virtual bool getHalf2Mode() const TRTNOEXCEPT = 0; //! //! \brief Set whether the builder should use debug synchronization. //! - //! If this flag is true, the builder will synchronize after timing each layer, and report the layer name. It can be useful when diagnosing issues at build time. + //! If this flag is true, the builder will synchronize after timing each layer, and report the layer name. It can + //! be useful when diagnosing issues at build time. + //! + //! \deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. //! - virtual void setDebugSync(bool sync) = 0; + TRT_DEPRECATED virtual void setDebugSync(bool sync) TRTNOEXCEPT = 0; //! //! \brief Query whether the builder will use debug synchronization. //! //! \see setDebugSync() //! - virtual bool getDebugSync() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getFlag instead. + //! + TRT_DEPRECATED virtual bool getDebugSync() const TRTNOEXCEPT = 0; //! //! \brief Set the number of minimization iterations used when timing layers. //! - //! When timing layers, the builder minimizes over a set of average times for layer execution. This parameter controls the number of iterations - //! used in minimization. + //! When timing layers, the builder minimizes over a set of average times for layer execution. This parameter + //! controls the number of iterations used in minimization. //! //! \see getMinFindIterations() //! - virtual void setMinFindIterations(int minFind) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setMinTimingIterations instead. + //! + TRT_DEPRECATED virtual void setMinFindIterations(int minFind) TRTNOEXCEPT = 0; //! //! \brief Query the number of minimization iterations. //! //! \see setMinFindIterations() //! - virtual int getMinFindIterations() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getMinTimingIterations instead. + //! + TRT_DEPRECATED virtual int getMinFindIterations() const TRTNOEXCEPT = 0; //! //! \brief Set the number of averaging iterations used when timing layers. //! - //! 
When timing layers, the builder minimizes over a set of average times for layer execution. This parameter controls the number of iterations - //! used in averaging. + //! When timing layers, the builder minimizes over a set of average times for layer execution. This parameter + //! controls the number of iterations used in averaging. //! //! \see getAverageFindIterations() //! - virtual void setAverageFindIterations(int avgFind) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setAvgTimingIterations instead. + //! + TRT_DEPRECATED virtual void setAverageFindIterations(int avgFind) TRTNOEXCEPT = 0; //! //! \brief Query the number of averaging iterations. //! //! \see setAverageFindIterations() //! - virtual int getAverageFindIterations() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getAvgTimingIterations instead. + //! + TRT_DEPRECATED virtual int getAverageFindIterations() const TRTNOEXCEPT = 0; //! //! \brief Build a CUDA engine from a network definition. //! //! \see INetworkDefinition ICudaEngine //! - virtual nvinfer1::ICudaEngine* buildCudaEngine(nvinfer1::INetworkDefinition& network) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::buildEngineWithConfig instead. + //! + TRT_DEPRECATED virtual nvinfer1::ICudaEngine* buildCudaEngine( + nvinfer1::INetworkDefinition& network) TRTNOEXCEPT = 0; //! //! \brief Determine whether the platform has fast native fp16. //! - virtual bool platformHasFastFp16() const = 0; + virtual bool platformHasFastFp16() const TRTNOEXCEPT = 0; //! //! \brief Determine whether the platform has fast native int8. //! - virtual bool platformHasFastInt8() const = 0; + virtual bool platformHasFastInt8() const TRTNOEXCEPT = 0; //! //! \brief Destroy this object. //! - virtual void destroy() = 0; + virtual void destroy() TRTNOEXCEPT = 0; //! - //! \brief Set the maximum value for a region. + //! \brief Set whether or not quantized 8-bit kernels are permitted. + //! + //! During engine build int8 kernels will also be tried when this mode is enabled. + //! + //! \param mode Whether quantized 8-bit kernels are permitted. //! - //! Used for INT8 mode compression. + //! \see getInt8Mode() //! - virtual void setInt8Mode(bool mode) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. + //! + TRT_DEPRECATED virtual void setInt8Mode(bool mode) TRTNOEXCEPT = 0; //! //! \brief Query whether Int8 mode is used. //! //! \see setInt8Mode() //! - virtual bool getInt8Mode() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getFlag instead. + //! + TRT_DEPRECATED virtual bool getInt8Mode() const TRTNOEXCEPT = 0; //! //! \brief Set Int8 Calibration interface. //! - virtual void setInt8Calibrator(IInt8Calibrator* calibrator) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setInt8Calibrator instead. + //! + TRT_DEPRECATED virtual void setInt8Calibrator(IInt8Calibrator* calibrator) TRTNOEXCEPT = 0; //! //! \brief Set the device that this layer must execute on. @@ -5453,50 +5593,66 @@ class IBuilder //! //! \see getDeviceType() //! - virtual void setDeviceType(ILayer* layer, DeviceType deviceType) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setDeviceType instead. + //! + TRT_DEPRECATED virtual void setDeviceType(ILayer* layer, DeviceType deviceType) TRTNOEXCEPT = 0; //! //! 
\brief Get the device that this layer executes on. //! \return Returns DeviceType of the layer. //! - virtual DeviceType getDeviceType(const ILayer* layer) const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getDeviceType instead. + //! + TRT_DEPRECATED virtual DeviceType getDeviceType(const ILayer* layer) const TRTNOEXCEPT = 0; //! //! \brief whether the DeviceType has been explicitly set for this layer //! \return whether the DeviceType has been explicitly set //! \see setDeviceType() getDeviceType() resetDeviceType() //! - virtual bool isDeviceTypeSet(const ILayer* layer) const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::isDeviceTypeSet instead. + //! + TRT_DEPRECATED virtual bool isDeviceTypeSet(const ILayer* layer) const TRTNOEXCEPT = 0; //! //! \brief reset the DeviceType for this layer //! //! \see setDeviceType() getDeviceType() isDeviceTypeSet() //! - virtual void resetDeviceType(ILayer* layer) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::resetDeviceType instead. + //! + TRT_DEPRECATED virtual void resetDeviceType(ILayer* layer) TRTNOEXCEPT = 0; //! //! \brief Checks if a layer can run on DLA. //! \return status true if the layer can on DLA else returns false. //! - virtual bool canRunOnDLA(const ILayer* layer) const = 0; + TRT_DEPRECATED virtual bool canRunOnDLA(const ILayer* layer) const TRTNOEXCEPT = 0; //! - //! \brief Sets the default DeviceType to be used by the builder. It ensures that all the layers that can run on this device will run on it, unless setDeviceType is used to override the default DeviceType for a layer. + //! \brief Sets the default DeviceType to be used by the builder. It ensures that all the layers that can run on + //! this device will run on it, unless setDeviceType is used to override the default DeviceType for a layer. //! \see getDefaultDeviceType() //! - virtual void setDefaultDeviceType(DeviceType deviceType) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setDefaultDeviceType instead. + //! + TRT_DEPRECATED virtual void setDefaultDeviceType(DeviceType deviceType) TRTNOEXCEPT = 0; //! //! \brief Get the default DeviceType which was set by setDefaultDeviceType. //! - virtual DeviceType getDefaultDeviceType() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getDefaultDeviceType instead. + //! + TRT_DEPRECATED virtual DeviceType getDefaultDeviceType() const TRTNOEXCEPT = 0; //! //! \brief Get the maximum batch size DLA can support. - //! For any tensor the total volume of index dimensions combined(dimensions other than CHW) with the requested batch size should not exceed the value returned by this function. + //! For any tensor the total volume of index dimensions combined(dimensions other than CHW) with the requested + //! batch size should not exceed the value returned by this function. + //! + //! \warning getMaxDLABatchSize does not work with dynamic shapes. //! - virtual int getMaxDLABatchSize() const = 0; + virtual int getMaxDLABatchSize() const TRTNOEXCEPT = 0; //! //! \brief Sets the builder to use GPU if a layer that was supposed to run on DLA can not run on DLA. @@ -5505,46 +5661,60 @@ class IBuilder //! \note GPU fallback may only be specified for non-safety modes. \see EngineCapability //! Simultaneously enabling GPU fallback and safety-restricted modes is disallowed. //! - virtual void allowGPUFallback(bool setFallBackMode) = 0; + //! 
\deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. + //! + TRT_DEPRECATED virtual void allowGPUFallback(bool setFallBackMode) TRTNOEXCEPT = 0; //! - //! \brief Returns number of DLA hardware cores accessible. + //! \brief Return the number of DLA engines available to this builder. //! - virtual int getNbDLACores() const = 0; + virtual int getNbDLACores() const TRTNOEXCEPT = 0; //! //! \brief Set the DLA core that the engine must execute on. - //! \param dlaCore The DLA core to execute the engine on (0 to N-1, where N is the maximum number of DLA cores present on the device). Default value is 0. - //! DLA Core is not a property of the engine that is preserved by serialization: when the engine is deserialized it will be associated with the DLA core which is configured for the runtime. + //! \param dlaCore The DLA core to execute the engine on (0 to N-1, where N is the maximum number of DLA cores + //! present on the device). Default value is 0. + //! DLA Core is not a property of the engine that is preserved by serialization: when the engine is deserialized + //! it will be associated with the DLA core which is configured for the runtime. //! \see IRuntime::setDLACore() getDLACore() //! - virtual void setDLACore(int dlaCore) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setDLACore instead. + //! + TRT_DEPRECATED virtual void setDLACore(int dlaCore) TRTNOEXCEPT = 0; //! //! \brief Get the DLA core that the engine executes on. //! \return If setDLACore is called, returns DLA core from 0 to N-1, else returns 0. //! - virtual int getDLACore() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getDLACore instead. + //! + TRT_DEPRECATED virtual int getDLACore() const TRTNOEXCEPT = 0; //! //! \brief Resets the builder state //! - virtual void reset(nvinfer1::INetworkDefinition& network) = 0; + //! \deprecated API will be removed in a future release, use IBuilder::reset() instead. + //! + TRT_DEPRECATED virtual void reset(nvinfer1::INetworkDefinition& network) TRTNOEXCEPT = 0; protected: - virtual ~IBuilder() {} + virtual ~IBuilder() + { + } public: //! //! \brief Set the GPU allocator. - //! \param allocator Set the GPU allocator to be used by the builder. All GPU memory acquired will use this allocator. If NULL is passed, the default allocator will be used. + //! \param allocator Set the GPU allocator to be used by the builder. All GPU memory acquired will use this + //! allocator. If NULL is passed, the default allocator will be used. //! //! Default: uses cudaMalloc/cudaFree. //! - //! \note This allocator will be passed to any engines created via the builder; thus the lifetime of the allocator must span the lifetime of those engines as + //! \note This allocator will be passed to any engines created via the builder; thus the lifetime of the allocator + //! must span the lifetime of those engines as //! well as that of the builder. If nullptr is passed, the default allocator will be used. //! - virtual void setGpuAllocator(IGpuAllocator* allocator) = 0; + virtual void setGpuAllocator(IGpuAllocator* allocator) TRTNOEXCEPT = 0; //! //! \brief Set whether or not 16-bit kernels are permitted. @@ -5555,22 +5725,28 @@ class IBuilder //! //! \see getFp16Mode() //! - virtual void setFp16Mode(bool mode) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. + //! + TRT_DEPRECATED virtual void setFp16Mode(bool mode) TRTNOEXCEPT = 0; //! //! 
\brief Query whether 16-bit kernels are permitted. //! //! \see setFp16Mode() //! - virtual bool getFp16Mode() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getFlag instead. + //! + TRT_DEPRECATED virtual bool getFp16Mode() const TRTNOEXCEPT = 0; //! //! \brief Set whether or not type constraints are strict. //! - //! When strict type constraints are in use, TensorRT will always choose a layer implementation that conforms to the type constraints - //! specified, if one exists. If this flag is not set, a higher-precision implementation may be chosen if it results in higher performance. + //! When strict type constraints are in use, TensorRT will always choose a layer implementation that conforms to the + //! type constraints specified, if one exists. If this flag is not set, a higher-precision implementation may be + //! chosen if it results in higher performance. //! - //! If no conformant layer exists, TensorRT will choose a non-conformant layer if available regardless of the setting of this flag. + //! If no conformant layer exists, TensorRT will choose a non-conformant layer if available regardless of the + //! setting of this flag. //! //! See the developer guide for the definition of strictness. //! @@ -5578,279 +5754,125 @@ class IBuilder //! //! \see getStrictTypeConstraints() //! - virtual void setStrictTypeConstraints(bool mode) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. + //! + TRT_DEPRECATED virtual void setStrictTypeConstraints(bool mode) TRTNOEXCEPT = 0; //! //! \brief Query whether or not type constraints are strict. //! //! \see setStrictTypeConstraints() //! - virtual bool getStrictTypeConstraints() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getFlag instead. + //! + TRT_DEPRECATED virtual bool getStrictTypeConstraints() const TRTNOEXCEPT = 0; //! //! Set whether engines will be refittable. //! - virtual void setRefittable(bool canRefit) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setFlag instead. + //! + TRT_DEPRECATED virtual void setRefittable(bool canRefit) TRTNOEXCEPT = 0; //! //! \brief Query whether or not engines will be refittable. //! //! \see getRefittable() //! - virtual bool getRefittable() const = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::getFlag instead. + //! + TRT_DEPRECATED virtual bool getRefittable() const TRTNOEXCEPT = 0; //! //! \brief Configure the builder to target specified EngineCapability flow. //! - virtual void setEngineCapability(EngineCapability capability) = 0; + //! \deprecated API will be removed in a future release, use IBuilderConfig::setEngineCapability instead. + //! + TRT_DEPRECATED virtual void setEngineCapability(EngineCapability capability) TRTNOEXCEPT = 0; //! //! \brief Query EngineCapability flow configured for the builder. //! //! \see setEngineCapability() //! - virtual EngineCapability getEngineCapability() const = 0; -}; - -//! -//! \enum WeightsRole -//! \brief How a layer uses particular Weights. -//! -//! The power weights of an IScaleLayer are omitted. Refitting those is not supported. -//! 
-enum class WeightsRole : int -{ - kKERNEL = 0, //!< kernel for IConvolutionLayer, IDeconvolutionLayer, or IFullyConnectedLayer - kBIAS = 1, //!< bias for IConvolutionLayer, IDeconvolutionLayer, or IFullyConnectedLayer - kSHIFT = 2, //!< shift part of IScaleLayer - kSCALE = 3, //!< scale part of IScaleLayer - kCONSTANT = 4, //!< weights for IConstantLayer -}; - -template <> -inline int EnumMax() -{ - return 5; -} //!< Maximum number of elements in WeightsRole enum. \see WeightsRole - -//! -//! \class IRefitter -//! -//! \brief Updates weights in an engine. -//! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. -//! -class IRefitter -{ -public: - //! - //! \brief Specify new weights for a layer of given name. - //! Returns true on success, or false if new weights are rejected. - //! Possible reasons for rejection are: - //! - //! * There is no such layer by that name. - //! * The layer does not have weights with the specified role. - //! * The number of weights is inconsistent with the layer’s original specification. - //! - //! Modifying the weights before method refit() completes will result in undefined behavior. - virtual bool setWeights(const char* layerName, - WeightsRole role, Weights weights) - = 0; - - //! - //! \brief Updates associated engine. Return true if successful. - //! - //! Failure occurs if getMissing() != 0 before the call. + //! \deprecated API will be removed in a future release, use IBuilderConfig::getEngineCapability instead. //! - virtual bool refitCudaEngine() = 0; + TRT_DEPRECATED virtual EngineCapability getEngineCapability() const TRTNOEXCEPT = 0; //! - //! \brief Get description of missing weights. - //! - //! For example, if some Weights have been set, but the engine was optimized - //! in a way that combines weights, any unsupplied Weights in the combination - //! are considered missing. - //! - //! \param size The number of items that can be safely written to a non-null layerNames or roles. - //! \param layerNames Where to write the layer names. - //! \param roles Where to write the weights roles. - //! - //! \return The number of missing Weights. - //! - //! If layerNames!=nullptr, each written pointer points to a string owned by - //! the engine being refitted, and becomes invalid when the engine is destroyed. - //! - virtual int getMissing(int size, const char** layerNames, WeightsRole* roles) = 0; - - //! - //! \brief Get description of all weights that could be refit. - //! - //! \param size The number of items that can be safely written to a non-null layerNames or roles. - //! \param layerNames Where to write the layer names. - //! \param roles Where to write the weights roles. + //! \brief Create a builder configuration object. //! - //! \return The number of Weights that could be refit. + //! \see IBuilderConfig //! - //! If layerNames!=nullptr, each written pointer points to a string owned by - //! the engine being refitted, and becomes invalid when the engine is destroyed. - //! - virtual int getAll(int size, const char** layerNames, WeightsRole* roles) = 0; - - virtual void destroy() = 0; - -protected: - virtual ~IRefitter() {} -}; + virtual nvinfer1::IBuilderConfig* createBuilderConfig() TRTNOEXCEPT = 0; -//! -//! \class IPluginFactory -//! -//! \brief Plugin factory for deserialization. -//! -//! This Interface is guaranteed not to change for the same major version of TensorRT. -class IPluginFactory -{ -public: - //! - //! \brief Create a plugin from serialized data. - //! - //! 
Responsibility of destroying this plugin lies with the application. - //! It can be done anytime after consumers of this plugin are destroyed. //! - //! \param layerName The name of the layer. - //! \param serialData The serialized data. - //! \param serialLength The length of the serialized data. - //! - //! \return The plugin. + //! \brief Builds an engine for the given INetworkDefinition and given IBuilderConfig. //! - //! \see IPlugin::serialize() + //! It enables the builder to build multiple engines based on the same network definition, but with different + //! builder configurations. //! - virtual IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) = 0; -}; + virtual nvinfer1::ICudaEngine* buildEngineWithConfig( + INetworkDefinition& network, IBuilderConfig& config) TRTNOEXCEPT = 0; -//! -//! \class IRuntime -//! -//! \brief Allows a serialized engine to be deserialized. -//! -//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. -//! -class IRuntime -{ -public: + //! \brief Create a network definition object //! - //! \brief Deserialize an engine from a stream. + //! Creates a network definition object with immutable properties specified using the flags parameter. Providing + //! the kDEFAULT flag as parameter mimics the behaviour of createNetwork(). CreateNetworkV2 supports dynamic shapes + //! and explicit batch dimensions when used with NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag. //! - //! \param blob The memory that holds the serialized engine. - //! \param size The size of the memory. - //! \param pluginFactory The plugin factory, if any plugins are used by the network, otherwise nullptr. + //! \param flags Bitset of NetworkDefinitionCreationFlags specifying network properties //! - //! \return The engine, or nullptr if it could not be deserialized. + //! \see INetworkDefinition, NetworkDefinitionCreationFlags //! - virtual nvinfer1::ICudaEngine* deserializeCudaEngine(const void* blob, std::size_t size, IPluginFactory* pluginFactory) = 0; + virtual nvinfer1::INetworkDefinition* createNetworkV2(NetworkDefinitionCreationFlags flags) TRTNOEXCEPT = 0; + //! \brief Create a new optimization profile. //! - //! \brief Set the DLA core that the deserialized engine must execute on. - //! \param dlaCore The DLA core to execute the engine on (0 to N-1, where N is the maximum number of DLA's present on the device). Default value is 0. - //! \see getDLACore() - //! - virtual void setDLACore(int dlaCore) = 0; - + //! If the network has any dynamic input tensors, the appropriate calls to setDimensions() must be made. + //! Likewise, if there are any shape input tensors, the appropriate calls to setShapeValues() are required. + //! The builder retains ownership of the created optimization profile and returns a raw pointer, i.e. the users + //! must not attempt to delete the returned pointer. //! - //! \brief Get the DLA core that the engine executes on. - //! \return If setDLACore is called, returns DLA core from 0 to N-1, else returns 0. + //! \see IOptimizationProfile //! - virtual int getDLACore() const = 0; + virtual nvinfer1::IOptimizationProfile* createOptimizationProfile() noexcept = 0; //! - //! \brief Returns number of DLA hardware cores accessible. + //! \brief Set the ErrorRecorder for this interface //! - virtual int getNbDLACores() const = 0; - + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! 
This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. //! - //! \brief Destroy this object. + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder //! - virtual void destroy() = 0; + virtual void setErrorRecorder(IErrorRecorder* recorder) TRTNOEXCEPT = 0; -protected: - virtual ~IRuntime() {} - -public: - //! - //! \brief Set the GPU allocator. - //! \param allocator Set the GPU allocator to be used by the runtime. All GPU memory acquired will use this allocator. If NULL is passed, the default allocator will be used. - //! - //! Default: uses cudaMalloc/cudaFree. //! - //! If nullptr is passed, the default allocator will be used. + //! \brief get the ErrorRecorder assigned to this interface. //! - virtual void setGpuAllocator(IGpuAllocator* allocator) = 0; -}; - -//! -//! \class ILogger -//! -//! \brief Application-implemented logging interface for the builder, engine and runtime. -//! -//! Note that although a logger is passed on creation to each instance of a IBuilder or IRuntime interface, the logger is internally considered a singleton, and thus -//! multiple instances of IRuntime and/or IBuilder must all use the same logger. -//! -class ILogger -{ -public: + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. //! - //! \enum Severity + //! \return A pointer to the IErrorRecorder object that has been registered. //! - //! The severity corresponding to a log message. + //! \see setErrorRecorder //! - enum class Severity - { - kINTERNAL_ERROR = 0, //!< An internal error has occurred. Execution is unrecoverable. - kERROR = 1, //!< An application error has occurred. - kWARNING = 2, //!< An application error has been discovered, but TensorRT has recovered or fallen back to a default. - kINFO = 3, //!< Informational messages with instructional information. - kVERBOSE = 4, //!< Verbose messages with debugging information. - }; + virtual IErrorRecorder* getErrorRecorder() const TRTNOEXCEPT = 0; //! - //! A callback implemented by the application to handle logging messages; + //! \brief Resets the builder state to default values. //! - //! \param severity The severity of the message. - //! \param msg The log message, null terminated. - //! - virtual void log(Severity severity, const char* msg) = 0; - - virtual ~ILogger() {} + virtual void reset() TRTNOEXCEPT = 0; }; -template <> -inline int EnumMax() -{ - return 5; -} //!< Maximum number of elements in ILogger::Severity enum. \see ILogger::Severity - } // namespace nvinfer1 -extern "C" TENSORRTAPI void* createInferBuilder_INTERNAL(void* logger, int version); //!< Internal C entry point for creating IBuilder. -extern "C" TENSORRTAPI void* createInferRefitter_INTERNAL(void* engine, void* logger, int version); //!< Internal C entry point for creating IRefitter. -extern "C" TENSORRTAPI void* createInferRuntime_INTERNAL(void* logger, int version); //!< Internal C entry point for creating IRuntime. - -//! -//! \brief Return the logger object. -//! -extern "C" TENSORRTAPI nvinfer1::ILogger* getLogger(); - -//! -//! \brief Return the library version number. -//! -//! The format is as for TENSORRT_VERSION: (TENSORRT_MAJOR * 1000) + (TENSORRT_MINOR * 100) + TENSOR_PATCH. -//! 
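Editor's note: the `createNetworkV2` and `createOptimizationProfile` additions above enable explicit-batch networks and dynamic shapes. Below is a minimal sketch, not part of this patch, of how the two are wired together; the tensor name `"input"` and the `Dims4` extents are placeholders, and `IBuilderConfig::addOptimizationProfile` is assumed to be the matching registration call.

```cpp
// Sketch: explicit-batch network plus one dynamic-shape optimization profile.
#include <cstdint>
#include "NvInfer.h"

nvinfer1::INetworkDefinition* makeExplicitBatchNetwork(nvinfer1::IBuilder& builder, nvinfer1::IBuilderConfig& config)
{
    const uint32_t flags = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder.createNetworkV2(flags);

    // The profile supplies min/opt/max dimensions for each dynamic input tensor.
    nvinfer1::IOptimizationProfile* profile = builder.createOptimizationProfile();
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 224, 224});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{8, 3, 224, 224});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{32, 3, 224, 224});
    config.addOptimizationProfile(profile);                 // builder retains ownership of the profile

    return network;
}
```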
-extern "C" TENSORRTAPI int getInferLibVersion(); - -//! -//! \brief Return the plugin registry -//! -extern "C" TENSORRTAPI nvinfer1::IPluginRegistry* getPluginRegistry(); +extern "C" TENSORRTAPI void* createInferBuilder_INTERNAL(void* logger, int version); //!< Internal C entry point for creating IBuilder. namespace nvinfer1 { @@ -5859,51 +5881,15 @@ namespace nvinfer1 //! //! This class is the logging class for the builder. //! -namespace // unnamed namespace in case the compiler doesn't inline these +//! unnamed namespace avoids linkage surprises when linking objects built with different versions of this header. +//! +namespace { inline IBuilder* createInferBuilder(ILogger& logger) { return static_cast(createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); } - -//! -//! \brief Create an instance of an IRefitter class. -//! -//! This class is the logging class for the refitter. -//! -inline IRefitter* createInferRefitter(ICudaEngine& engine, ILogger& logger) -{ - return static_cast(createInferRefitter_INTERNAL(&engine, &logger, NV_TENSORRT_VERSION)); -} - -//! -//! \brief Create an instance of an IRuntime class. -//! -//! This class is the logging class for the runtime. -//! -inline IRuntime* createInferRuntime(ILogger& logger) -{ - return static_cast(createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); -} } - -//! -//! \brief Register the plugin creator to the registry -//! The static registry object will be instantiated when the plugin library is -//! loaded. This static object will register all creators available in the -//! library to the registry. -//! -template -class PluginRegistrar -{ -public: - PluginRegistrar() { getPluginRegistry()->registerCreator(instance, ""); } -private: - T instance{}; -}; - -#define REGISTER_TENSORRT_PLUGIN(name) \ - static nvinfer1::PluginRegistrar pluginRegistrar##name {} } #endif diff --git a/include/NvInferPlugin.h b/include/NvInferPlugin.h index e6ff5861..5d06aa92 100644 --- a/include/NvInferPlugin.h +++ b/include/NvInferPlugin.h @@ -19,270 +19,14 @@ #include "NvInfer.h" #include "NvInferPluginUtils.h" - //! //! \file NvInferPlugin.h //! //! This is the API for the Nvidia provided TensorRT plugins. //! -namespace nvinfer1 -{ -namespace plugin -{ - -//! -//! \brief The PriorBox plugin layer generates the prior boxes of designated sizes and aspect ratios across all dimensions (H x W). -//! PriorBoxParameters defines a set of parameters for creating the PriorBox plugin layer. -//! It contains: -//! \param minSize Minimum box size in pixels. Can not be nullptr. -//! \param maxSize Maximum box size in pixels. Can be nullptr. -//! \param aspectRatios Aspect ratios of the boxes. Can be nullptr. -//! \param numMinSize Number of elements in minSize. Must be larger than 0. -//! \param numMaxSize Number of elements in maxSize. Can be 0 or same as numMinSize. -//! \param numAspectRatios Number of elements in aspectRatios. Can be 0. -//! \param flip If true, will flip each aspect ratio. For example, if there is aspect ratio "r", the aspect ratio "1.0/r" will be generated as well. -//! \param clip If true, will clip the prior so that it is within [0,1]. -//! \param variance Variance for adjusting the prior boxes. -//! \param imgH Image height. If 0, then the H dimension of the data tensor will be used. -//! \param imgW Image width. If 0, then the W dimension of the data tensor will be used. -//! \param stepH Step in H. If 0, then (float)imgH/h will be used where h is the H dimension of the 1st input tensor. -//! \param stepW Step in W. 
If 0, then (float)imgW/w will be used where w is the W dimension of the 1st input tensor. -//! \param offset Offset to the top left corner of each cell. -//! -struct PriorBoxParameters -{ - float *minSize, *maxSize, *aspectRatios; - int numMinSize, numMaxSize, numAspectRatios; - bool flip; - bool clip; - float variance[4]; - int imgH, imgW; - float stepH, stepW; - float offset; -}; - -//! -//! \brief The Anchor Generator plugin layer generates the prior boxes of designated sizes and aspect ratios across all dimensions (H x W). -//! GridAnchorParameters defines a set of parameters for creating the plugin layer for all feature maps. -//! It contains: -//! \param minScale Scale of anchors corresponding to finest resolution. -//! \param maxScale Scale of anchors corresponding to coarsest resolution. -//! \param aspectRatios List of aspect ratios to place on each grid point. -//! \param numAspectRatios Number of elements in aspectRatios. -//! \param H Height of feature map to generate anchors for. -//! \param W Width of feature map to generate anchors for. -//! \param variance Variance for adjusting the prior boxes. -//! -struct GridAnchorParameters -{ - float minSize, maxSize; - float* aspectRatios; - int numAspectRatios, H, W; - float variance[4]; -}; - -//! -//! \enum CodeTypeSSD -//! \brief The type of encoding used for decoding the bounding boxes and loc_data. -//! -enum class CodeTypeSSD : int -{ - CORNER = 0, //!< Use box corners. - CENTER_SIZE = 1, //!< Use box centers and size. - CORNER_SIZE = 2, //!< Use box centers and size. - TF_CENTER = 3 //!< Use box centers and size but flip x and y coordinates. -}; - -//! -//! \brief The DetectionOutput plugin layer generates the detection output based on location and confidence predictions by doing non maximum suppression. -//! This plugin first decodes the bounding boxes based on the anchors generated. It then performs non_max_suppression on the decoded bouding boxes. -//! DetectionOutputParameters defines a set of parameters for creating the DetectionOutput plugin layer. -//! It contains: -//! \param shareLocation If true, bounding box are shared among different classes. -//! \param varianceEncodedInTarget If true, variance is encoded in target. Otherwise we need to adjust the predicted offset accordingly. -//! \param backgroundLabelId Background label ID. If there is no background class, set it as -1. -//! \param numClasses Number of classes to be predicted. -//! \param topK Number of boxes per image with top confidence scores that are fed into the NMS algorithm. -//! \param keepTopK Number of total bounding boxes to be kept per image after NMS step. -//! \param confidenceThreshold Only consider detections whose confidences are larger than a threshold. -//! \param nmsThreshold Threshold to be used in NMS. -//! \param codeType Type of coding method for bbox. -//! \param inputOrder Specifies the order of inputs {loc_data, conf_data, priorbox_data}. -//! \param confSigmoid Set to true to calculate sigmoid of confidence scores. -//! \param isNormalized Set to true if bounding box data is normalized by the network. -//! -struct DetectionOutputParameters -{ - bool shareLocation, varianceEncodedInTarget; - int backgroundLabelId, numClasses, topK, keepTopK; - float confidenceThreshold, nmsThreshold; - CodeTypeSSD codeType; - int inputOrder[3]; - bool confSigmoid; - bool isNormalized; -}; - -//! -//! 
\brief The Region plugin layer performs region proposal calculation: generate 5 bounding boxes per cell (for yolo9000, generate 3 bounding boxes per cell). -//! For each box, calculating its probablities of objects detections from 80 pre-defined classifications (yolo9000 has 9416 pre-defined classifications, -//! and these 9416 items are organized as work-tree structure). -//! RegionParameters defines a set of parameters for creating the Region plugin layer. -//! \param num Number of predicted bounding box for each grid cell. -//! \param coords Number of coordinates for a bounding box. -//! \param classes Number of classfications to be predicted. -//! \param softmaxTree When performing yolo9000, softmaxTree is helping to do softmax on confidence scores, for element to get the precise classfication through word-tree structured classfication definition. -//! \deprecated. This plugin is superseded by createRegionPlugin() -//! -typedef struct -{ - int* leaf; - int n; - int* parent; - int* child; - int* group; - char** name; - - int groups; - int* groupSize; - int* groupOffset; -} softmaxTree; // softmax tree - -struct RegionParameters -{ - int num; - int coords; - int classes; - softmaxTree* smTree; -}; - -//! -//! \brief The NMSParameters are used by the BatchedNMSPlugin for performing -//! the non_max_suppression operation over boxes for object detection networks. -//! \param shareLocation If set to true, the boxes inputs are shared across all -//! classes. If set to false, the boxes input should account for per class box data. -//! \param backgroundLabelId Label ID for the background class. If there is no background class, set it as -1 -//! \param numClasses Number of classes in the network. -//! \param topK Number of bounding boxes to be fed into the NMS step. -//! \param keepTopK Number of total bounding boxes to be kept per image after NMS step. -//! Should be less than or equal to the topK value. -//! \param scoreThreshold Scalar threshold for score (low scoring boxes are removed). -//! \param iouThreshold scalar threshold for IOU (new boxes that have high IOU overlap -//! with previously selected boxes are removed). -//! \param isNormalized Set to false, if the box coordinates are not -//! normalized, i.e. not in the range [0,1]. Defaults to true. -//! - -struct NMSParameters -{ - bool shareLocation; - int backgroundLabelId, numClasses, topK, keepTopK; - float scoreThreshold, iouThreshold; - bool isNormalized; -}; - -} // end plugin namespace -} // end nvinfer1 namespace - extern "C" { -//! -//! \brief Create a plugin layer that fuses the RPN and ROI pooling using user-defined parameters. -//! Registered plugin type "RPROI_TRT". Registered plugin version "1". -//! \param featureStride Feature stride. -//! \param preNmsTop Number of proposals to keep before applying NMS. -//! \param nmsMaxOut Number of remaining proposals after applying NMS. -//! \param iouThreshold IoU threshold. -//! \param minBoxSize Minimum allowed bounding box size before scaling. -//! \param spatialScale Spatial scale between the input image and the last feature map. -//! \param pooling Spatial dimensions of pooled ROIs. -//! \param anchorRatios Aspect ratios for generating anchor windows. -//! \param anchorScales Scales for generating anchor windows. -//! -//! \return Returns a FasterRCNN fused RPN+ROI pooling plugin. Returns nullptr on invalid inputs. -//! 
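Editor's note: the bundled plugins documented in this header are also reachable through the plugin registry once `initLibNvInferPlugins()` (declared further down in this file) has been called, using the registered type/version strings given above. A minimal sketch, not part of this patch; the helper name and the empty field collection are illustrative only, since each creator expects its own plugin-specific `PluginField` entries.

```cpp
// Sketch: obtaining a registered TensorRT plugin via the plugin registry.
#include "NvInferPlugin.h"

nvinfer1::IPluginV2* createPluginByRegistry(nvinfer1::ILogger& logger, const char* pluginType)
{
    initLibNvInferPlugins(&logger, "");   // register the bundled plugins, e.g. "Reorg_TRT", "NMS_TRT"

    nvinfer1::IPluginCreator* creator = getPluginRegistry()->getPluginCreator(pluginType, "1");
    if (creator == nullptr)
    {
        return nullptr;
    }

    nvinfer1::PluginFieldCollection fc{}; // real use must populate the creator's expected fields
    return creator->createPlugin("my_plugin_instance", &fc);
}
```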
-TENSORRTAPI nvinfer1::IPluginV2* createRPNROIPlugin(int featureStride, int preNmsTop, - int nmsMaxOut, float iouThreshold, float minBoxSize, - float spatialScale, nvinfer1::DimsHW pooling, - nvinfer1::Weights anchorRatios, nvinfer1::Weights anchorScales); - -//! -//! \brief The Normalize plugin layer normalizes the input to have L2 norm of 1 with scale learnable. -//! Registered plugin type "Normalize_TRT". Registered plugin version "1". -//! \param scales Scale weights that are applied to the output tensor. -//! \param acrossSpatial Whether to compute the norm over adjacent channels (acrossSpatial is true) or nearby spatial locations (within channel in which case acrossSpatial is false). -//! \param channelShared Whether the scale weight(s) is shared across channels. -//! \param eps Epsilon for not diviiding by zero. -//! -TENSORRTAPI nvinfer1::IPluginV2* createNormalizePlugin(const nvinfer1::Weights* scales, bool acrossSpatial, bool channelShared, float eps); - -//! -//! \brief The PriorBox plugin layer generates the prior boxes of designated sizes and aspect ratios across all dimensions (H x W). -//! PriorBoxParameters defines a set of parameters for creating the PriorBox plugin layer. -//! Registered plugin type "PriorBox_TRT". Registered plugin version "1". -//! -TENSORRTAPI nvinfer1::IPluginV2* createPriorBoxPlugin(nvinfer1::plugin::PriorBoxParameters param); - -//! -//! \brief The Grid Anchor Generator plugin layer generates the prior boxes of -//! designated sizes and aspect ratios across all dimensions (H x W) for all feature maps. -//! GridAnchorParameters defines a set of parameters for creating the GridAnchorGenerator plugin layer. -//! Registered plugin type "GridAnchor_TRT". Registered plugin version "1". -//! -TENSORRTAPI nvinfer1::IPluginV2* createAnchorGeneratorPlugin(nvinfer1::plugin::GridAnchorParameters* param, int numLayers); - -//! -//! \brief The DetectionOutput plugin layer generates the detection output based on location and confidence predictions by doing non maximum suppression. -//! DetectionOutputParameters defines a set of parameters for creating the DetectionOutput plugin layer. -//! Registered plugin type "NMS_TRT". Registered plugin version "1". -//! -TENSORRTAPI nvinfer1::IPluginV2* createNMSPlugin(nvinfer1::plugin::DetectionOutputParameters param); - -//! -//! \brief The Reorg plugin reshapes input of shape CxHxW into a (C*stride*stride)x(H/stride)x(W/stride) shape, used in YOLOv2. -//! It does that by taking 1 x stride x stride slices from tensor and flattening them into (stridexstride) x 1 x 1 shape. -//! Registered plugin type "Reorg_TRT". Registered plugin version "1". -//! \param stride Strides in H and W, it should divide both H and W. Also stride * stride should be less than or equal to C. -//! -TENSORRTAPI nvinfer1::IPluginV2* createReorgPlugin(int stride); - -//! -//! \brief The Region plugin layer performs region proposal calculation: generate 5 bounding boxes per cell (for yolo9000, generate 3 bounding boxes per cell). -//! For each box, calculating its probablities of objects detections from 80 pre-defined classifications (yolo9000 has 9416 pre-defined classifications, -//! and these 9416 items are organized as work-tree structure). -//! RegionParameters defines a set of parameters for creating the Region plugin layer. -//! Registered plugin type "Region_TRT". Registered plugin version "1". -//! -TENSORRTAPI nvinfer1::IPluginV2* createRegionPlugin(nvinfer1::plugin::RegionParameters params); - -//! -//! 
\brief The BatchedNMS Plugin performs non_max_suppression on the input boxes, per batch, across all classes. -//! It greedily selects a subset of bounding boxes in descending order of -//! score. Prunes away boxes that have a high intersection-over-union (IOU) -//! overlap with previously selected boxes. Bounding boxes are supplied as [y1, x1, y2, x2], -//! where (y1, x1) and (y2, x2) are the coordinates of any -//! diagonal pair of box corners and the coordinates can be provided as normalized -//! (i.e., lying in the interval [0, 1]) or absolute. -//! The plugin expects two inputs. -//! Input0 is expected to be 4-D float boxes tensor of shape [batch_size, num_boxes, -//! q, 4], where q can be either 1 (if shareLocation is true) or num_classes. -//! Input1 is expected to be a 3-D float scores tensor of shape [batch_size, num_boxes, num_classes] -//! representing a single score corresponding to each box. -//! The plugin returns four outputs. -//! num_detections : A [batch_size] int32 tensor indicating the number of valid -//! detections per batch item. Can be less than keepTopK. Only the top num_detections[i] entries in -//! nmsed_boxes[i], nmsed_scores[i] and nmsed_classes[i] are valid. -//! nmsed_boxes : A [batch_size, max_detections, 4] float32 tensor containing -//! the co-ordinates of non-max suppressed boxes. -//! nmsed_scores : A [batch_size, max_detections] float32 tensor containing the -//! scores for the boxes. -//! nmsed_classes : A [batch_size, max_detections] float32 tensor containing the -//! classes for the boxes. -//! -//! Registered plugin type "BatchedNMS_TRT". Registered plugin version "1". -//! -TENSORRTAPI nvinfer1::IPluginV2* createBatchedNMSPlugin(nvinfer1::plugin::NMSParameters param); - //! //! \brief Initialize and register all the existing TensorRT plugins to the Plugin Registry with an optional namespace. //! The plugin library author should ensure that this function name is unique to the library. diff --git a/include/NvInferPluginUtils.h b/include/NvInferPluginUtils.h index 097112e5..fbe8197d 100644 --- a/include/NvInferPluginUtils.h +++ b/include/NvInferPluginUtils.h @@ -16,6 +16,7 @@ #ifndef NV_INFER_PLUGIN_UTILS_H #define NV_INFER_PLUGIN_UTILS_H +#include "NvInferRuntimeCommon.h" //! //! \file NvPluginUtils.h //! @@ -25,9 +26,78 @@ namespace nvinfer1 { +//! +//! \enum PluginType +//! +//! \brief The type values for the various plugins. +//! +//! \see INvPlugin::getPluginType() +//! +enum class PluginType : int +{ + kFASTERRCNN = 0, //!< FasterRCNN fused plugin (RPN + ROI pooling). + kNORMALIZE = 1, //!< Normalize plugin. + kPERMUTE = 2, //!< Permute plugin. + kPRIORBOX = 3, //!< PriorBox plugin. + kSSDDETECTIONOUTPUT = 4, //!< SSD DetectionOutput plugin. + kCONCAT = 5, //!< Concat plugin. + kPRELU = 6, //!< YOLO PReLU Plugin. + kYOLOREORG = 7, //!< YOLO Reorg Plugin. + kYOLOREGION = 8, //!< YOLO Region Plugin. + kANCHORGENERATOR = 9, //!< SSD Grid Anchor Generator. +}; + +//!< Maximum number of elements in PluginType enum. \see PluginType +template <> +constexpr inline int EnumMax() +{ + return 10; +} + namespace plugin { +//! +//! \brief The Permute plugin layer permutes the input tensor by changing the memory order of the data. +//! Quadruple defines a structure that contains an array of 4 integers. They can represent the permute orders or the strides in each dimension. +//! +typedef struct +{ + int data[4]; +} Quadruple; + + +//! +//! 
\brief The PriorBox plugin layer generates the prior boxes of designated sizes and aspect ratios across all dimensions (H x W). +//! PriorBoxParameters defines a set of parameters for creating the PriorBox plugin layer. +//! It contains: +//! \param minSize Minimum box size in pixels. Can not be nullptr. +//! \param maxSize Maximum box size in pixels. Can be nullptr. +//! \param aspectRatios Aspect ratios of the boxes. Can be nullptr. +//! \param numMinSize Number of elements in minSize. Must be larger than 0. +//! \param numMaxSize Number of elements in maxSize. Can be 0 or same as numMinSize. +//! \param numAspectRatios Number of elements in aspectRatios. Can be 0. +//! \param flip If true, will flip each aspect ratio. For example, if there is aspect ratio "r", the aspect ratio "1.0/r" will be generated as well. +//! \param clip If true, will clip the prior so that it is within [0,1]. +//! \param variance Variance for adjusting the prior boxes. +//! \param imgH Image height. If 0, then the H dimension of the data tensor will be used. +//! \param imgW Image width. If 0, then the W dimension of the data tensor will be used. +//! \param stepH Step in H. If 0, then (float)imgH/h will be used where h is the H dimension of the 1st input tensor. +//! \param stepW Step in W. If 0, then (float)imgW/w will be used where w is the W dimension of the 1st input tensor. +//! \param offset Offset to the top left corner of each cell. +//! +struct PriorBoxParameters +{ + float *minSize, *maxSize, *aspectRatios; + int numMinSize, numMaxSize, numAspectRatios; + bool flip; + bool clip; + float variance[4]; + int imgH, imgW; + float stepH, stepW; + float offset; +}; + //! //! \brief RPROIParams is used to create the RPROIPlugin instance. //! It contains: @@ -56,6 +126,127 @@ struct RPROIParams float spatialScale; }; + +//! +//! \brief The Anchor Generator plugin layer generates the prior boxes of designated sizes and aspect ratios across all dimensions (H x W). +//! GridAnchorParameters defines a set of parameters for creating the plugin layer for all feature maps. +//! It contains: +//! \param minScale Scale of anchors corresponding to finest resolution. +//! \param maxScale Scale of anchors corresponding to coarsest resolution. +//! \param aspectRatios List of aspect ratios to place on each grid point. +//! \param numAspectRatios Number of elements in aspectRatios. +//! \param H Height of feature map to generate anchors for. +//! \param W Width of feature map to generate anchors for. +//! \param variance Variance for adjusting the prior boxes. +//! +struct GridAnchorParameters +{ + float minSize, maxSize; + float* aspectRatios; + int numAspectRatios, H, W; + float variance[4]; +}; + +//! +//! \enum CodeTypeSSD +//! \brief The type of encoding used for decoding the bounding boxes and loc_data. +//! +enum class CodeTypeSSD : int +{ + CORNER = 0, //!< Use box corners. + CENTER_SIZE = 1, //!< Use box centers and size. + CORNER_SIZE = 2, //!< Use box centers and size. + TF_CENTER = 3 //!< Use box centers and size but flip x and y coordinates. +}; + +//! +//! \brief The DetectionOutput plugin layer generates the detection output based on location and confidence predictions by doing non maximum suppression. +//! This plugin first decodes the bounding boxes based on the anchors generated. It then performs non_max_suppression on the decoded bouding boxes. +//! DetectionOutputParameters defines a set of parameters for creating the DetectionOutput plugin layer. +//! It contains: +//! 
\param shareLocation If true, bounding box are shared among different classes. +//! \param varianceEncodedInTarget If true, variance is encoded in target. Otherwise we need to adjust the predicted offset accordingly. +//! \param backgroundLabelId Background label ID. If there is no background class, set it as -1. +//! \param numClasses Number of classes to be predicted. +//! \param topK Number of boxes per image with top confidence scores that are fed into the NMS algorithm. +//! \param keepTopK Number of total bounding boxes to be kept per image after NMS step. +//! \param confidenceThreshold Only consider detections whose confidences are larger than a threshold. +//! \param nmsThreshold Threshold to be used in NMS. +//! \param codeType Type of coding method for bbox. +//! \param inputOrder Specifies the order of inputs {loc_data, conf_data, priorbox_data}. +//! \param confSigmoid Set to true to calculate sigmoid of confidence scores. +//! \param isNormalized Set to true if bounding box data is normalized by the network. +//! +struct DetectionOutputParameters +{ + bool shareLocation, varianceEncodedInTarget; + int backgroundLabelId, numClasses, topK, keepTopK; + float confidenceThreshold, nmsThreshold; + CodeTypeSSD codeType; + int inputOrder[3]; + bool confSigmoid; + bool isNormalized; +}; + + +//! +//! \brief The Region plugin layer performs region proposal calculation: generate 5 bounding boxes per cell (for yolo9000, generate 3 bounding boxes per cell). +//! For each box, calculating its probablities of objects detections from 80 pre-defined classifications (yolo9000 has 9418 pre-defined classifications, +//! and these 9418 items are organized as work-tree structure). +//! RegionParameters defines a set of parameters for creating the Region plugin layer. +//! \param num Number of predicted bounding box for each grid cell. +//! \param coords Number of coordinates for a bounding box. +//! \param classes Number of classfications to be predicted. +//! \param softmaxTree When performing yolo9000, softmaxTree is helping to do softmax on confidence scores, for element to get the precise classfication through word-tree structured classfication definition. +//! \deprecated. This plugin is superseded by createRegionPlugin() +//! +TRT_DEPRECATED typedef struct +{ + int* leaf; + int n; + int* parent; + int* child; + int* group; + char** name; + + int groups; + int* groupSize; + int* groupOffset; +} softmaxTree; // softmax tree + +struct TRT_DEPRECATED RegionParameters +{ + int num; + int coords; + int classes; + softmaxTree* smTree; +}; + +//! +//! \brief The NMSParameters are used by the BatchedNMSPlugin for performing +//! the non_max_suppression operation over boxes for object detection networks. +//! \param shareLocation If set to true, the boxes inputs are shared across all +//! classes. If set to false, the boxes input should account for per class box data. +//! \param backgroundLabelId Label ID for the background class. If there is no background class, set it as -1 +//! \param numClasses Number of classes in the network. +//! \param topK Number of bounding boxes to be fed into the NMS step. +//! \param keepTopK Number of total bounding boxes to be kept per image after NMS step. +//! Should be less than or equal to the topK value. +//! \param scoreThreshold Scalar threshold for score (low scoring boxes are removed). +//! \param iouThreshold scalar threshold for IOU (new boxes that have high IOU overlap +//! with previously selected boxes are removed). +//! 
\param isNormalized Set to false, if the box coordinates are not +//! normalized, i.e. not in the range [0,1]. Defaults to false. +//! + +struct NMSParameters +{ + bool shareLocation; + int backgroundLabelId, numClasses, topK, keepTopK; + float scoreThreshold, iouThreshold; + bool isNormalized; +}; + } // end plugin namespace } // end nvinfer1 namespace #endif diff --git a/include/NvInferRuntime.h b/include/NvInferRuntime.h new file mode 100644 index 00000000..5788e75a --- /dev/null +++ b/include/NvInferRuntime.h @@ -0,0 +1,1773 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NV_INFER_RUNTIME_H +#define NV_INFER_RUNTIME_H + +//! +//! \file NvInferRuntime.h +//! +//! This is the top-level API file for TensorRT extended runtime library. +//! + +#include "NvInferRuntimeCommon.h" + +namespace nvinfer1 +{ + +class IExecutionContext; //!< Forward declaration of IExecutionContext for use by other interfaces. +class ICudaEngine; //!< Forward declaration of ICudaENgine for use by other interfaces. +class IPluginFactory; //!< Forward declaration of IPluginFactory for use by other interfaces. + +//! +//! \enum EngineCapability +//! +//! \brief List of supported engine capability flows. +//! +//! \note at present, kSAFE_DLA flow doesn't strictly limit execution to DLA devices - it simply +//! restricts the engine capabilities to DLA support levels anticipated in future releases. +//! +enum class EngineCapability : int +{ + kDEFAULT = 0, //!< Full capability, TensorRT mode without any restrictions. + kSAFE_GPU = 1, //!< Safety restricted capability, TensorRT flow that can only run on GPU devices. + kSAFE_DLA = 2, //!< Safety restricted capability, TensorRT flow that can only run on DLA devices. +}; + +template <> +constexpr inline int EnumMax() +{ + return 3; +} //!< Maximum number of elements in EngineCapability enum. \see EngineCapability + + +//! +//! \class Weights +//! +//! \brief An array of weights used as a layer parameter. +//! +//! The weights are held by reference until the engine has been built. Therefore the data referenced +//! by \p values field should be preserved until the build is complete. +//! +class Weights +{ +public: + DataType type; //!< The type of the weights. + const void* values; //!< The weight values, in a contiguous array. + int64_t count; //!< The number of weights in the array. +}; + +//! +//! \class IHostMemory +//! +//! \brief Class to handle library allocated memory that is accessible to the user. +//! +//! The memory allocated via the host memory object is owned by the library and will +//! be de-allocated when the destroy method is called. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IHostMemory +{ +public: + virtual void* data() const noexcept = 0; //!< A pointer to the raw data that is owned by the library. 
+ virtual std::size_t size() const noexcept = 0; //!< The size in bytes of the data that was allocated. + virtual DataType type() const noexcept = 0; //!< The type of the memory that was allocated. + virtual void destroy() noexcept = 0; //!< Destroy the allocated memory. +protected: + virtual ~IHostMemory() {} +}; + +//! \class IPlugin +//! +//! \brief Plugin class for user-implemented layers. +//! +//! Plugins are a mechanism for applications to implement custom layers. Each plugin is owned by the application, and its lifetime +//! must span any use of it by TensorRT +//! +class IPlugin +{ +public: + //! + //! \brief Get the number of outputs from the layer. + //! + //! \return The number of outputs. + //! + //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). + //! + virtual int getNbOutputs() const TRTNOEXCEPT = 0; + + //! + //! \brief Get the dimension of an output tensor. + //! + //! \param index The index of the output tensor. + //! \param inputs The input tensors. + //! \param nbInputDims The number of input tensors. + //! + //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). + //! + virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRTNOEXCEPT = 0; + + //! + //! \brief Configure the layer. + //! + //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis + //! of its weights, dimensions, and maximum batch size. The type is assumed to be FP32 and format NCHW. + //! + //! \param inputDims The input tensor dimensions. + //! \param nbInputs The number of inputs. + //! \param outputDims The output tensor dimensions. + //! \param nbOutputs The number of outputs. + //! \param maxBatchSize The maximum batch size. + //! + //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). + //! + //! This method is not called for PluginExt classes, configureWithFormat is called instead. + //! + virtual void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) TRTNOEXCEPT = 0; + + //! + //! \brief Initialize the layer for execution. This is called when the engine is created. + //! + //! \return 0 for success, else non-zero (which will cause engine termination). + //! + virtual int initialize() TRTNOEXCEPT = 0; + + //! + //! \brief Release resources acquired during plugin layer initialization. This is called when the engine is destroyed. + //! \see initialize() + //! + virtual void terminate() TRTNOEXCEPT = 0; + + //! + //! \brief Find the workspace size required by the layer. + //! + //! This function is called during engine startup, after initialize(). The workspace size returned should be sufficient for any + //! batch size up to the maximum. + //! + //! \return The workspace size. + //! + virtual size_t getWorkspaceSize(int maxBatchSize) const TRTNOEXCEPT = 0; + + //! + //! \brief Execute the layer. + //! + //! \param batchSize The number of inputs in the batch. + //! \param inputs The memory for the input tensors. + //! \param outputs The memory for the output tensors. + //! \param workspace Workspace for execution. + //! \param stream The stream in which to execute the kernels. + //! + //! 
\return 0 for success, else non-zero (which will cause engine termination). + //! + virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) TRTNOEXCEPT = 0; + + //! + //! \brief Find the size of the serialization buffer required. + //! + //! \return The size of the serialization buffer. + //! + virtual size_t getSerializationSize() TRTNOEXCEPT = 0; + + //! + //! \brief Serialize the layer. + //! + //! \param buffer A pointer to a buffer of size at least that returned by getSerializationSize(). + //! + //! \see getSerializationSize() + //! + virtual void serialize(void* buffer) TRTNOEXCEPT = 0; + + virtual ~IPlugin() {} +}; + +//! +//! \class IPluginExt +//! +//! \brief Plugin class for user-implemented layers. +//! +//! Plugins are a mechanism for applications to implement custom layers. Each plugin is owned by the application, and its lifetime +//! must span any use of it by TensorRT. +//! +class IPluginExt : public IPlugin +{ +public: + //! + //! \brief Return the API version with which this plugin was built. + //! + //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with plugins. + //! + virtual int getTensorRTVersion() const TRTNOEXCEPT + { + return NV_TENSORRT_VERSION; + } + + //! + //! \brief Check format support. + //! + //! \param type DataType requested. + //! \param format PluginFormat requested. + //! \return true if the plugin supports the type-format combination. + //! + //! This function is called by the implementations of INetworkDefinition, IBuilder, and ICudaEngine. + //! In particular, it is called when creating an engine and when deserializing an engine. + //! + virtual bool supportsFormat(DataType type, PluginFormat format) const TRTNOEXCEPT = 0; + + //! + //! \brief Configure the layer. + //! + //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis + //! of its weights, dimensions, and maximum batch size. + //! + //! \param inputDims The input tensor dimensions. + //! \param nbInputs The number of inputs. + //! \param outputDims The output tensor dimensions. + //! \param nbOutputs The number of outputs. + //! \param type The data type selected for the engine. + //! \param format The format selected for the engine. + //! \param maxBatchSize The maximum batch size. + //! + //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). + //! + virtual void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) TRTNOEXCEPT = 0; + + virtual ~IPluginExt() {} + +protected: + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + void configure(const Dims* /*inputDims*/, int /*nbInputs*/, const Dims* /*outputDims*/, int /*nbOutputs*/, int /*maxBatchSize*/) _TENSORRT_FINAL TRTNOEXCEPT {} +}; + +//! +//! \enum DimensionOperation +//! +//! \brief An operation on two IDimensionExpr, which represent integer expressions used in dimension computations. +//! +//! For example, given two IDimensionExpr x and y and an IExprBuilder& eb, +//! eb.operation(DimensionOperation::kSUM, x, y) creates a representation of x+y. +//! +//! \see IDimensionExpr, IExprBuilder +//! +enum class DimensionOperation : int +{ + kSUM = 0, //!< Sum of the two operands. 
+ kPROD = 1, //!< Product of the two operands. + kMAX = 2, //!< Maximum of the two operands. + kMIN = 3, //!< Minimum of the two operands. + kSUB = 4, //!< Substract the second element from the first. + kEQUAL = 5, //!< 1 if operands are equal, 0 otherwise. + kLESS = 6, //!< 1 if first operand is less than second operand, 0 otherwise. + kFLOOR_DIV = 7, //!< Floor division of the first element by the second. + kCEIL_DIV = 8 //!< Division rounding up +}; + +template <> +constexpr inline int EnumMax() +{ + return 9; +} //!< Maximum number of elements in DimensionOperation enum. \see DimensionOperation + +//! +//! \class IDimensionExpr +//! +//! An IDimensionExpr represents an integer expression constructed from constants, +//! input dimensions, and binary operations. These expressions are can be used +//! in overrides of IPluginV2DynamicExt::getOutputDimensions to define output +//! dimensions in terms of input dimensions. +//! +//! \see DimensionOperation, IPluginV2DynamicExt::getOutputDimensions +//! +class IDimensionExpr +{ +public: + //! Return true if expression is a build-time constant. + virtual bool isConstant() const = 0; + + //! If isConstant(), returns value of the constant. + //! If !isConstant(), return std::numeric_limits::min(). + virtual int getConstantValue() const = 0; +}; + +//! +//! \class IExprBuilder +//! +//! Object for constructing IDimensionExpr. +//! +//! There is no public way to construct an IExprBuilder. It appears as an argument to +//! method IPluginV2DynamicExt::getOutputDimensions(). Overrides of that method can use +//! that IExprBuilder argument to construct expressions that define output dimensions +//! in terms of input dimensions. +//! +//! Clients should assume that any values constructed by the IExprBuilder are destroyed +//! after IPluginV2DynamicExt::getOutputDimensions() returns. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +//! \see IDimensionExpr +//! +class IExprBuilder +{ +public: + //! Return pointer to IDimensionExp for given value. + virtual const IDimensionExpr* constant(int value) = 0; + + //! Return pointer to IDimensionExp that represents the given operation applied to first and second. + //! Returns nullptr if op is not a valid DimensionOperation. + virtual const IDimensionExpr* operation(DimensionOperation op, const IDimensionExpr& first, const IDimensionExpr& second) = 0; + +protected: + virtual ~IExprBuilder() {} +}; + +//! +//! \class DimsExprs +//! +//! Analog of class Dims with expressions instead of constants for the dimensions. +//! +class DimsExprs +{ +public: + int nbDims; //!< The number of dimensions. + const IDimensionExpr* d[Dims::MAX_DIMS]; //!< The extent of each dimension. +}; + +//! +//! \class DynamicPluginTensorDesc +//! +//! Summarizes tensors that a plugin might see for an input or output. +//! +struct DynamicPluginTensorDesc +{ + //! Information required to interpret a pointer to tensor data, except that desc.dims has -1 in place of any runtime dimension. + PluginTensorDesc desc; + + //! Lower bounds on tensor’s dimensions + Dims min; + + //! Upper bounds on tensor’s dimensions + Dims max; +}; + +//! +//! \class IPluginV2DynamicExt +//! +//! Similar to IPluginV2Ext, but with support for dynamic shapes. +//! +//! Clients should override the public methods, including the following inherited methods: +//! +//! virtual int getNbOutputs() const TRTNOEXCEPT = 0; +//! 
virtual nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRTNOEXCEPT = 0; +//! virtual size_t getSerializationSize() const TRTNOEXCEPT = 0; +//! virtual void serialize(void* buffer) const TRTNOEXCEPT = 0; +//! virtual void destroy() TRTNOEXCEPT = 0; +//! virtual void setPluginNamespace(const char* pluginNamespace) TRTNOEXCEPT = 0; +//! virtual const char* getPluginNamespace() const TRTNOEXCEPT = 0; +//! +//! For getOutputDataType, the inputTypes will always be DataType::kFLOAT or DataType::kINT32, +//! and the returned type is canonicalized to DataType::kFLOAT if it is DataType::kHALF or DataType:kINT8. +//! Details about the floating-point precision are elicited later by method supportsFormatCombination. +//! +class IPluginV2DynamicExt : public nvinfer1::IPluginV2Ext +{ +public: + IPluginV2DynamicExt* clone() const _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; + + //! + //! \brief Get expressions for computing dimensions of an output tensor from dimensions of the input tensors. + //! + //! \param outputIndex The index of the output tensor + //! \param inputs Expressions for dimensions of the input tensors + //! \param nbInputDims The number of input tensors + //! \param exprBuilder Object for generating new expressions + //! + //! This function is called by the implementations of IBuilder during analysis of the network. + //! + //! Example #1: A plugin has a single output that transposes the last two dimensions of the plugin's single input. + //! The body of the override of getOutputDimensions can be: + //! + //! DimsExprs output(inputs[0]); + //! std::swap(output.d[output.nbDims-1], output.d[output.nbDims-2]); + //! return output; + //! + //! Example #2: A plugin concatenates its two inputs along the first dimension. + //! The body of the override of getOutputDimensions can be: + //! + //! DimsExprs output(inputs[0]); + //! output.d[0] = exprBuilder.operation(DimensionOperation::kSUM, inputs[0].d[0], inputs[1].d[0]); + //! return output; + //! + virtual DimsExprs getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) = 0; + + //! + //! Limit on number of format combinations accepted. + //! + static constexpr int kFORMAT_COMBINATION_LIMIT = 100; + + //! + //! \brief Return true if plugin supports the format and datatype for the input/output indexed by pos. + //! + //! For this method inputs are numbered 0..(nbInputs-1) and outputs are numbered nbInputs..(nbInputs+nbOutputs-1). + //! Using this numbering, pos is an index into InOut, where 0 <= pos < nbInputs+nbOutputs-1. + //! + //! TensorRT invokes this method to ask if the input/output indexed by pos supports the format/datatype specified + //! by inOut[pos].format and inOut[pos].type. The override should return true if that format/datatype at inOut[pos] + //! are supported by the plugin. If support is conditional on other input/output formats/datatypes, the plugin can + //! make its result conditional on the formats/datatypes in inOut[0..pos-1], which will be set to values + //! that the plugin supports. The override should not inspect inOut[pos+1..nbInputs+nbOutputs-1], + //! which will have invalid values. In other words, the decision for pos must be based on inOut[0..pos] only. + //! + //! Some examples: + //! + //! * A definition for a plugin that supports only FP16 NCHW: + //! + //! return inOut.format[pos] == TensorFormat::kLINEAR && inOut.type[pos] == DataType::kHALF; + //! + //! 
* A definition for a plugin that supports only FP16 NCHW for its two inputs, + //! and FP32 NCHW for its single output: + //! + //! return inOut.format[pos] == TensorFormat::kLINEAR && (inOut.type[pos] == pos < 2 ? DataType::kHALF : DataType::kFLOAT); + //! + //! * A definition for a "polymorphic" plugin with two inputs and one output that supports + //! any format or type, but the inputs and output must have the same format and type: + //! + //! return pos == 0 || (inOut.format[pos] == inOut.format[0] && inOut.type[pos] == inOut.type[0]); + //! + //! Warning: TensorRT will stop asking for formats once it finds kFORMAT_COMBINATION_LIMIT on combinations. + //! + virtual bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) TRTNOEXCEPT = 0; + + //! + //! \brief Configure the layer. + //! + //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make + //! algorithm choices on the basis of bounds on the input and output tensors, and the target value. + //! + //! \param in The input tensors attributes that are used for configuration. + //! \param nbInputs Number of input tensors. + //! \param out The output tensors attributes that are used for configuration. + //! \param nbOutputs Number of output tensors. + //! + virtual void configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) TRTNOEXCEPT = 0; + + //! + //! \brief Find the workspace size required by the layer. + //! + //! This function is called after the plugin is configured, and possibly during execution. + //! The result should be a sufficient workspace size to deal with inputs and outputs of the given size + //! or any smaller problem. + //! + //! \return The workspace size. + //! + virtual size_t getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs, int nbOutputs) const TRTNOEXCEPT = 0; + + //! + //! \brief Execute the layer. + //! + //! \param inputDesc how to interpret the memory for the input tensors. + //! \param outputDesc how to interpret the memory for the output tensors. + //! \param inputs The memory for the input tensors. + //! \param outputs The memory for the output tensors. + //! \param workspace Workspace for execution. + //! \param stream The stream in which to execute the kernels. + //! + //! \return 0 for success, else non-zero (which will cause engine termination). + //! + virtual int enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRTNOEXCEPT = 0; + +protected: + int getTensorRTVersion() const _TENSORRT_OVERRIDE TRTNOEXCEPT + { + return (static_cast(PluginVersion::kV2_DYNAMICEXT) << 24 | (NV_TENSORRT_VERSION & 0xFFFFFF)); + } + + virtual ~IPluginV2DynamicExt() {} + + // Rest of the methods below are obsolete inherited methods, and marked final when using a C++11 compiler. + // Derived classes should not override them. + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + //! Instead, derived classes should override the overload of getOutputDimensions that returns DimsExprs. + //! + TRT_DEPRECATED + Dims getOutputDimensions(int /*index*/, const Dims* /*inputs*/, int /*nbInputDims*/) _TENSORRT_FINAL TRTNOEXCEPT + { + Dims result; + result.nbDims = -1; + return result; + } + + //! + //! \brief Derived classes should not implement this. 
In a C++11 API it would be override final. + //! + //! With dynamic shapes, there is no implicit batch dimension to broadcast across. + //! + TRT_DEPRECATED + bool isOutputBroadcastAcrossBatch(int /*outputIndex*/, const bool* /*inputIsBroadcasted*/, int /*nbInputs*/) const _TENSORRT_FINAL TRTNOEXCEPT + { + return false; + } + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + //! With dynamic shapes, there is no implicit batch dimension to broadcast across. + //! + TRT_DEPRECATED + bool canBroadcastInputAcrossBatch(int /*inputIndex*/) const _TENSORRT_FINAL TRTNOEXCEPT + { + return true; + } + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + //! This method is not used because it does not allow a plugin to specify mixed formats. + //! + //! Instead, derived classes should override supportsFormatCombination, which allows plugins + //! to express mixed formats. + //! + TRT_DEPRECATED + bool supportsFormat(DataType /*type*/, PluginFormat /*format*/) const _TENSORRT_FINAL TRTNOEXCEPT { return false; } + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + //! This method is not used because tensors with dynamic shapes do not have an implicit batch dimension, + //! input dimensions might be variable, and outputs might have different floating-point formats.. + //! + //! Instead, derived classes should override the overload of configurePlugin that takes poiners to DynamicPluginTensorDesc. + //! + TRT_DEPRECATED + void configurePlugin(const Dims* /*inputDims*/, int /*nbInputs*/, const Dims* /*outputDims*/, + int /*nbOutputs*/, const DataType* /*inputTypes*/, const DataType* /*outputTypes*/, + const bool* /*inputIsBroadcast*/, const bool* /*outputIsBroadcast*/, PluginFormat /*floatFormat*/, int /*maxBatchSize*/) _TENSORRT_FINAL TRTNOEXCEPT {} + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + //! This method is not used because tensors with dynamic shapes do not have an implicit batch dimension, + //! and the other dimensions might not be build-time constants. + //! + //! Instead, derived classes should override the overload of getWorkspaceSize that takes pointers to PluginTensorDesc. + //! The arguments to that overload provide maximum bounds on all dimensions. + //! + TRT_DEPRECATED + size_t getWorkspaceSize(int /*maxBatchSize*/) const _TENSORRT_FINAL TRTNOEXCEPT { return 0; } + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + //! This method is not used because tensors with dynamic shapes can have different sizes in different execution contexts. + //! + //! Instead, derived classes should override the overload of enqueue that takes pointers to PluginTensorDesc. + //! + TRT_DEPRECATED + int enqueue(int /*batchSize*/, const void* const* /*inputs*/, void** /*outputs*/, void* /*workspace*/, cudaStream_t /*stream*/) _TENSORRT_FINAL TRTNOEXCEPT + { + return 1; + } +}; + +//! +//! \class IProfiler +//! +//! \brief Application-implemented interface for profiling. +//! +//! When this class is added to an execution context, the profiler will be called once per layer for each invocation of execute(). +//! Note that enqueue() does not currently support profiling. +//! +//! The profiler will only be called after execution is complete. It has a small impact on execution time. +//! 
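Editor's note: a minimal sketch of the two `IPluginV2DynamicExt` overrides discussed above, following the examples embedded in the header comments; the class name is hypothetical, and the remaining pure-virtual methods inherited from `IPluginV2Ext`/`IPluginV2` (clone, configurePlugin, enqueue, serialization, and so on) are elided, so the class stays abstract.

```cpp
// Sketch: dynamic-shape plugin whose output shape equals its input shape.
#include "NvInferRuntime.h"

class IdentityDynamicPlugin : public nvinfer1::IPluginV2DynamicExt
{
public:
    // Output dimensions are expressed in terms of the (possibly dynamic) input dimensions.
    nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
        int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override
    {
        return inputs[0];
    }

    // Accept linear FP32 for the single input and the single output.
    bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut,
        int nbInputs, int nbOutputs) override
    {
        return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR
            && inOut[pos].type == nvinfer1::DataType::kFLOAT;
    }

    // ... configurePlugin, getWorkspaceSize, enqueue, clone, serialization, etc. still required.
};
```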
+class IProfiler +{ +public: + //! + //! \brief Layer time reporting callback. + //! + //! \param layerName The name of the layer, set when constructing the network definition. + //! \param ms The time in milliseconds to execute the layer. + //! + virtual void reportLayerTime(const char* layerName, float ms) TRTNOEXCEPT = 0; + + virtual ~IProfiler() {} +}; + +//! +//! \enum WeightsRole +//! \brief How a layer uses particular Weights. +//! +//! The power weights of an IScaleLayer are omitted. Refitting those is not supported. +//! +enum class WeightsRole : int +{ + kKERNEL = 0, //!< kernel for IConvolutionLayer, IDeconvolutionLayer, or IFullyConnectedLayer + kBIAS = 1, //!< bias for IConvolutionLayer, IDeconvolutionLayer, or IFullyConnectedLayer + kSHIFT = 2, //!< shift part of IScaleLayer + kSCALE = 3, //!< scale part of IScaleLayer + kCONSTANT = 4, //!< weights for IConstantLayer +}; + +template <> +constexpr inline int EnumMax() +{ + return 5; +} //!< Maximum number of elements in WeightsRole enum. \see WeightsRole + +//! +//! \enum DeviceType +//! \brief The device that this layer/network will execute on. +//! +//! +enum class DeviceType : int +{ + kGPU, //!< GPU Device + kDLA, //!< DLA Core +}; +template <> +constexpr inline int EnumMax() +{ + return 2; +} //!< Maximum number of elements in DeviceType enum. \see DeviceType + + +//! +//! \class IRuntime +//! +//! \brief Allows a serialized functionally unsafe engine to be deserialized. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IRuntime +{ +public: + //! + //! \brief Deserialize an engine from a stream. + //! + //! \param blob The memory that holds the serialized engine. + //! \param size The size of the memory. + //! \param pluginFactory The plugin factory, if any plugins are used by the network, otherwise nullptr. + //! + //! \return The engine, or nullptr if it could not be deserialized. + //! + virtual nvinfer1::ICudaEngine* deserializeCudaEngine(const void* blob, std::size_t size, IPluginFactory* pluginFactory) noexcept = 0; + + //! + //! \brief Set the DLA core that the deserialized engine must execute on. + //! \param dlaCore The DLA core to execute the engine on (0 to N-1, where N is the maximum number of DLA's present on the device). Default value is 0. + //! \see getDLACore() + //! + virtual void setDLACore(int dlaCore) noexcept = 0; + + //! + //! \brief Get the DLA core that the engine executes on. + //! \return If setDLACore is called, returns DLA core from 0 to N-1, else returns 0. + //! + virtual int getDLACore() const noexcept = 0; + + //! + //! \brief Returns number of DLA hardware cores accessible. + //! + virtual int getNbDLACores() const noexcept = 0; + + //! + //! \brief Destroy this object. + //! + virtual void destroy() noexcept = 0; + +protected: + virtual ~IRuntime() {} + +public: + //! + //! \brief Set the GPU allocator. + //! \param allocator Set the GPU allocator to be used by the runtime. All GPU memory acquired will use this allocator. If NULL is passed, the default allocator will be used. + //! + //! Default: uses cudaMalloc/cudaFree. + //! + //! If nullptr is passed, the default allocator will be used. + //! + virtual void setGpuAllocator(IGpuAllocator* allocator) noexcept = 0; + + //! + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! 
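A minimal sketch of an IProfiler implementation follows, assuming the context it is attached to is run with execute() (per the note above, enqueue() is not profiled). AccumulatingProfiler is an illustrative name, not part of TensorRT.

```cpp
#include <cstdio>
#include <map>
#include <string>
#include "NvInferRuntime.h"

// Accumulates the per-layer times reported by TensorRT across invocations of execute().
class AccumulatingProfiler : public nvinfer1::IProfiler
{
public:
    void reportLayerTime(const char* layerName, float ms) override
    {
        mTotalMs[layerName] += ms;
    }

    void print() const
    {
        for (const auto& entry : mTotalMs)
            std::printf("%-60s %9.3f ms\n", entry.first.c_str(), entry.second);
    }

private:
    std::map<std::string, float> mTotalMs;
};

// Usage, given an existing IExecutionContext* context:
//   AccumulatingProfiler profiler;
//   context->setProfiler(&profiler);        // profiling applies to execute(), not enqueue()
//   context->execute(batchSize, bindings);
//   profiler.print();
```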
This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! + virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0; + + //! + //! \brief get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. + //! + //! \return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual IErrorRecorder* getErrorRecorder() const noexcept = 0; + + //! + //! \breif Deserialize an engine from a stream when plugin factory is not used. + //! + //! \param blob The memory that holds the serialized engine. + //! \param size The size of the memory. + //! + //! \return The engine, or nullptr if it could not be deserialized. + //! + nvinfer1::ICudaEngine* deserializeCudaEngine(const void* blob, std::size_t size) noexcept + { + return deserializeCudaEngine(blob, size, nullptr); + } +}; + +//! +//! \class IRefitter +//! +//! \brief Updates weights in an engine. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IRefitter +{ +public: + //! + //! \brief Specify new weights for a layer of given name. + //! Returns true on success, or false if new weights are rejected. + //! Possible reasons for rejection are: + //! + //! * There is no such layer by that name. + //! * The layer does not have weights with the specified role. + //! * The number of weights is inconsistent with the layer’s original specification. + //! + //! Modifying the weights before method refit() completes will result in undefined behavior. + virtual bool setWeights(const char* layerName, WeightsRole role, Weights weights) TRTNOEXCEPT = 0; + + //! + //! \brief Updates associated engine. Return true if successful. + //! + //! Failure occurs if getMissing() != 0 before the call. + //! + virtual bool refitCudaEngine() TRTNOEXCEPT = 0; + + //! + //! \brief Get description of missing weights. + //! + //! For example, if some Weights have been set, but the engine was optimized + //! in a way that combines weights, any unsupplied Weights in the combination + //! are considered missing. + //! + //! \param size The number of items that can be safely written to a non-null layerNames or roles. + //! \param layerNames Where to write the layer names. + //! \param roles Where to write the weights roles. + //! + //! \return The number of missing Weights. + //! + //! If layerNames!=nullptr, each written pointer points to a string owned by + //! the engine being refitted, and becomes invalid when the engine is destroyed. + //! + virtual int getMissing(int size, const char** layerNames, WeightsRole* roles) TRTNOEXCEPT = 0; + + //! + //! \brief Get description of all weights that could be refit. + //! + //! \param size The number of items that can be safely written to a non-null layerNames or roles. + //! \param layerNames Where to write the layer names. + //! \param roles Where to write the weights roles. + //! + //! \return The number of Weights that could be refit. + //! + //! 
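As an illustration of the IRuntime methods above, here is a hedged sketch of deserializing an engine with the plugin-factory-free overload; gLogger and gEngineBlob are assumptions standing in for an application-provided ILogger implementation and a serialized engine already read into memory.

```cpp
#include <vector>
#include "NvInferRuntime.h"

// Assumptions: an application-defined ILogger instance and a serialized engine in memory.
extern nvinfer1::ILogger& gLogger;     // hypothetical
extern std::vector<char> gEngineBlob;  // hypothetical, e.g. read from a .engine file

void deserializeExample()
{
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);

    // Optional: pick a DLA core before deserializing an engine built for DLA.
    if (runtime->getNbDLACores() > 0)
        runtime->setDLACore(0);

    // Uses the overload above, which forwards a nullptr IPluginFactory.
    nvinfer1::ICudaEngine* engine
        = runtime->deserializeCudaEngine(gEngineBlob.data(), gEngineBlob.size());

    // ... create execution contexts and run inference ...

    if (engine)
        engine->destroy();
    runtime->destroy();
}
```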
If layerNames!=nullptr, each written pointer points to a string owned by + //! the engine being refitted, and becomes invalid when the engine is destroyed. + //! + virtual int getAll(int size, const char** layerNames, WeightsRole* roles) TRTNOEXCEPT = 0; + + virtual void destroy() TRTNOEXCEPT = 0; + +protected: + virtual ~IRefitter() {} + +public: + //! + //! Update dynamic range for a tensor. + //! + //! \param name of an ITensor used to construct the network. + //! + //! \return True if successful; false otherwise. + //! + //! Returns false if there is no Int8 engine tensor derived from + //! a network tensor of that name. If successful, then getMissing + //! may report that some weights need to be supplied. + virtual bool setDynamicRange(const char* tensorName, float min, float max) TRTNOEXCEPT = 0; + + //! + //! \brief Get minimum of dynamic range. + //! + //! \return Minimum of dynamic range. + //! + //! If the dynamic range was never set, returns the minimum computed during calibration. + //! + virtual float getDynamicRangeMin(const char* tensorName) const TRTNOEXCEPT = 0; + + //! + //! \brief Get maximum of dynamic range. + //! + //! \return Maximum of dynamic range. + //! + //! If the dynamic range was never set, returns the maximum computed during calibration. + //! + virtual float getDynamicRangeMax(const char* tensorName) const TRTNOEXCEPT = 0; + + //! + //! \brief Get names of all tensors that have refittable dynamic ranges. + //! + //! \param size The number of items that can be safely written to a non-null tensorNames. + //! \param tensorNames Where to write the layer names. + //! + //! \return The number of Weights that could be refit. + //! + //! If tensorNames!=nullptr, each written pointer points to a string owned by + //! the engine being refitted, and becomes invalid when the engine is destroyed. + //! + virtual int getTensorsWithDynamicRange(int size, const char** tensorNames) const TRTNOEXCEPT = 0; + + //! + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! + virtual void setErrorRecorder(IErrorRecorder* recorder) TRTNOEXCEPT = 0; + + //! + //! \brief get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. + //! + //! \return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual IErrorRecorder* getErrorRecorder() const TRTNOEXCEPT = 0; +}; + +//! +//! \class IPluginFactory +//! +//! \brief Plugin factory for deserialization. +//! +//! This Interface is guaranteed not to change for the same major version of TensorRT. +class IPluginFactory +{ +public: + //! + //! \brief Create a plugin from serialized data. + //! + //! Responsibility of destroying this plugin lies with the application. + //! It can be done anytime after consumers of this plugin are destroyed. + //! + //! \param layerName The name of the layer. + //! 
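A hedged sketch of the refit flow described above: create an IRefitter with createInferRefitter() (declared near the end of this header), supply new weights, query what is still missing, then call refitCudaEngine(). The layer name "conv1" and the refitConvWeights wrapper are placeholders.

```cpp
#include <vector>
#include "NvInferRuntime.h"

// Hypothetical wrapper: replace the kernel weights of a layer named "conv1" and refit.
bool refitConvWeights(nvinfer1::ICudaEngine& engine, nvinfer1::ILogger& logger,
                      nvinfer1::Weights newKernel)
{
    nvinfer1::IRefitter* refitter = nvinfer1::createInferRefitter(engine, logger);

    if (!refitter->setWeights("conv1", nvinfer1::WeightsRole::kKERNEL, newKernel))
    {
        refitter->destroy();
        return false;
    }

    // refitCudaEngine() fails while any weights are still reported missing.
    const int nbMissing = refitter->getMissing(0, nullptr, nullptr);
    if (nbMissing != 0)
    {
        std::vector<const char*> names(nbMissing);
        std::vector<nvinfer1::WeightsRole> roles(nbMissing);
        refitter->getMissing(nbMissing, names.data(), roles.data());
        // ... supply the missing weights with further setWeights() calls ...
    }

    const bool ok = refitter->refitCudaEngine();
    refitter->destroy();
    return ok;
}
```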
\param serialData The serialized data. + //! \param serialLength The length of the serialized data. + //! + //! \return The plugin. + //! + //! \see IPlugin::serialize() + //! + virtual IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) TRTNOEXCEPT = 0; +}; + +//! +//! \enum OptProfileSelector +//! +//! \brief When setting or querying optimization profile parameters (such as shape tensor inputs or dynamic dimensions), +//! select whether we are interested in the minimum, optimum, or maximum values for these parameters. +//! The minimum and maximum specify the permitted range that is supported at runtime, while the optimum value +//! is used for the kernel selection. This should be the "typical" value that is expected to occur at runtime. +//! +//! \see IOptimizationProfile::setDimensions(), IOptimizationProfile::setShapeValues() +//! +enum class OptProfileSelector : int +{ + kMIN = 0, //!< This is used to set or get the minimum permitted value for dynamic dimensions etc. + kOPT = 1, //!< This is used to set or get the value that is used in the optimization (kernel selection). + kMAX = 2 //!< This is used to set or get the maximum permitted value for dynamic dimensions etc. +}; + +template <> +constexpr inline int EnumMax() +{ + return 3; +} //!< Number of different values of OptProfileSelector enum. \see OptProfileSelector + +//! +//! \class IOptimizationProfile +//! \brief Optimization profile for dynamic input dimensions and shape tensors. +//! +//! When building an ICudaEngine from an INetworkDefinition that has dynamically resizable inputs (at least +//! one input tensor has one or more of its dimensions specified as -1) or shape input tensors, users need to specify +//! at least one optimization profile. Optimization profiles are numbered 0, 1, ... +//! The first optimization profile that has been defined (with index 0) will be used by the ICudaEngine whenever no +//! optimization profile has been selected explicitly. If none of the inputs are dynamic, the default optimization +//! profile will be generated automatically unless it is explicitly provided by the user (this is possible but not +//! required in this case). If more than a single optimization profile is defined, users may set a target how +//! much additional weight space should be maximally allocated to each additional profile (as a fraction of the +//! maximum, unconstrained memory). +//! +//! Users set optimum input tensor dimensions, as well as minimum and maximum input tensor dimensions. The builder +//! selects the kernels that result in the lowest runtime for the optimum input tensor dimensions, and are valid for +//! all input tensor sizes in the valid range between minimum and maximum dimensions. A runtime error will be raised +//! if the input tensor dimensions fall outside the valid range for this profile. Likewise, users provide minimum, +//! optimum, and maximum values for all shape tensor input values. +//! +//! \see IBuilderConfig::addOptimizationProfile() +//! +class IOptimizationProfile +{ +public: + //! + //! \brief Set the minimum / optimum / maximum dimensions for a dynamic input tensor. + //! + //! This function must be called three times (for the minimum, optimum, and maximum) for any network input tensor + //! that has dynamic dimensions. If minDims, optDims, and maxDims are the minimum, optimum, and maximum dimensions, + //! and networkDims are the dimensions for this input tensor that are provided to the INetworkDefinition object, + //! 
then the following conditions must all hold: + //! + //! (1) minDims.nbDims == optDims.nbDims == maxDims.nbDims == networkDims.nbDims + //! (2) 1 <= minDims.d[i] <= optDims.d[i] <= maxDims.d[i] for i = 0, ..., networkDims.nbDims-1 + //! (3) if networkDims.d[i] != -1, then minDims.d[i] == optDims.d[i] == maxDims.d[i] == networkDims.d[i] + //! + //! This function may (but need not be) called for an input tensor that does not have dynamic dimensions. In this + //! case, the third argument must always equal networkDims. + //! + //! \param inputName The input tensor name + //! \param select Whether to set the minimum, optimum, or maximum dimensions + //! \param dims The minimum, optimum, or maximum dimensions for this input tensor + //! + //! \return false if an inconsistency was detected (e.g. the rank does not match another dimension that was + //! previously set for the same input), true if no inconsistency was detected. Note that inputs can be + //! validated only partially; a full validation is performed at engine build time. + //! + virtual bool setDimensions(const char* inputName, OptProfileSelector select, Dims dims) noexcept = 0; + + //! + //! \brief Get the minimum / optimum / maximum dimensions for a dynamic input tensor. + //! + //! If the dimensions have not been previously set via setDimensions(), return an invalid Dims with nbDims == -1. + //! + virtual Dims getDimensions(const char* inputName, OptProfileSelector select) const noexcept = 0; + + //! + //! \brief Set the minimum / optimum / maximum values for an input shape tensor. + //! + //! This function must be called three times for every input tensor t that is a shape tensor (t.isShape() == true). + //! This implies that the datatype of t is DataType::kINT32, the rank is either 0 or 1, and the dimensions of t + //! are fixed at network definition time. This function must not be called for any input tensor that is not a + //! shape tensor. + //! Each time this function is called for the same input tensor, the same nbValues must be supplied (either 1 + //! if the tensor rank is 0, or dims.d[0] if the rank is 1). Furthermore, if minVals, optVals, maxVals are the + //! minimum, optimum, and maximum values, it must be true that minVals[i] <= optVals[i] <= maxVals[i] for + //! i = 0, ..., nbValues - 1. + //! + //! \param inputName The input tensor name + //! \param select Whether to set the minimum, optimum, or maximum input values. + //! \param values An array of length nbValues containing the minimum, optimum, or maximum shape tensor elements. + //! \param nbValues The length of the value array, which must equal the number of shape tensor elements (>= 1) + //! + //! \return false if an inconsistency was detected (e.g. nbValues does not match a previous call for the same + //! tensor), else true. As for setDimensions(), a full validation can only be performed at engine build + //! time. + //! + virtual bool setShapeValues( + const char* inputName, OptProfileSelector select, const int32_t* values, int nbValues) noexcept = 0; + + //! + //! \brief Get the number of values for an input shape tensor. + //! + //! This will return the number of shape values if setShapeValues() has been called before for this input tensor. + //! Otherwise, return -1. + //! + virtual int getNbShapeValues(const char* inputName) const noexcept = 0; + + //! + //! \brief Get the minimum / optimum / maximum values for an input shape tensor. + //! + //! If the shape values have not been set previously with setShapeValues(), this returns nullptr. + //! 
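A hedged sketch of filling a profile under the conditions above, assuming a network input "input" declared as {-1, 3, -1, -1}, a four-element input shape tensor "shape", and the Dims4 helper from NvInfer.h; the IOptimizationProfile object itself would come from the builder.

```cpp
#include <cstdint>
#include "NvInfer.h"  // Dims4 helper; the profile comes from the builder

void fillProfile(nvinfer1::IOptimizationProfile* profile)
{
    using nvinfer1::OptProfileSelector;

    // Dynamic input "input" declared as {-1, 3, -1, -1}: the fixed channel
    // dimension (3) must be identical in kMIN/kOPT/kMAX (condition 3 above).
    profile->setDimensions("input", OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 224, 224});
    profile->setDimensions("input", OptProfileSelector::kOPT, nvinfer1::Dims4{8, 3, 224, 224});
    profile->setDimensions("input", OptProfileSelector::kMAX, nvinfer1::Dims4{32, 3, 448, 448});

    // Input shape tensor "shape" with four elements: minVals[i] <= optVals[i] <= maxVals[i].
    const int32_t minVals[4] = {1, 3, 224, 224};
    const int32_t optVals[4] = {8, 3, 224, 224};
    const int32_t maxVals[4] = {32, 3, 448, 448};
    profile->setShapeValues("shape", OptProfileSelector::kMIN, minVals, 4);
    profile->setShapeValues("shape", OptProfileSelector::kOPT, optVals, 4);
    profile->setShapeValues("shape", OptProfileSelector::kMAX, maxVals, 4);

    // Partial validation only; full validation happens when the engine is built.
    if (!profile->isValid())
    {
        // handle an inconsistent profile
    }
}
```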
+ virtual const int32_t* getShapeValues(const char* inputName, OptProfileSelector select) const noexcept = 0; + + //! + //! \brief Set a target for extra GPU memory that may be used by this profile. + //! + //! \param target Additional memory that the builder should aim to maximally allocate for this profile, as a + //! fraction of the memory it would use if the user did not impose any constraints on memory. This + //! unconstrained case is the default; it corresponds to target == 1.0. If target == 0.0, the builder + //! aims to create the new optimization profile without allocating any additional weight memory. + //! Valid inputs lie between 0.0 and 1.0. This parameter is only a hint, and TensorRT does not guarantee + //! that the target will be reached. This parameter is ignored for the first (default) optimization profile + //! that is defined. + //! + //! \return true if the input is in the valid range (between 0 and 1 inclusive), else false + //! + virtual bool setExtraMemoryTarget(float target) noexcept = 0; + + //! + //! \brief Get the extra memory target that has been defined for this profile. + //! + virtual float getExtraMemoryTarget() const noexcept = 0; + + //! + //! \brief Check whether the optimization profile can be passed to an IBuilderConfig object. + //! + //! This function performs partial validation, by e.g. checking that whenever one of the minimum, optimum, or + //! maximum dimensions of a tensor have been set, the others have also been set and have the same rank, as + //! well as checking that the optimum dimensions are always as least as large as the minimum dimensions, and + //! that the maximum dimensions are at least as large as the optimum dimensions. Some validation steps require + //! knowledge of the network definition and are deferred to engine build time. + //! + //! \return true if the optimization profile is valid and may be passed to an IBuilderConfig, else false + //! + virtual bool isValid() const noexcept = 0; + +protected: + ~IOptimizationProfile() noexcept = default; +}; + +//! +//! \class ICudaEngine +//! +//! \brief An engine for executing inference on a built network, with functionally unsafe features. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class ICudaEngine +{ +public: + //! + //! \brief Get the number of binding indices. + //! + //! If the engine has been built for K profiles, the first getNbBindings() / K bindings are used by profile + //! number 0, the following getNbBindings() / K bindings are used by profile number 1 etc. + //! + //! \see getBindingIndex(); + //! + virtual int getNbBindings() const noexcept = 0; + + //! + //! \brief Retrieve the binding index for a named tensor. + //! + //! IExecutionContext::enqueue() and IExecutionContext::execute() require an array of buffers. + //! + //! Engine bindings map from tensor names to indices in this array. + //! Binding indices are assigned at engine build time, and take values in the range [0 ... n-1] where n is the total number of inputs and outputs. + //! + //! \param name The tensor name. + //! \return The binding index for the named tensor, or -1 if the name is not found. + //! + //! see getNbBindings() getBindingIndex() + //! + virtual int getBindingIndex(const char* name) const noexcept = 0; + + //! + //! \brief Retrieve the name corresponding to a binding index. + //! + //! This is the reverse mapping to that provided by getBindingIndex(). + //! + //! \param bindingIndex The binding index. + //! 
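The per-profile grouping of bindings described above reduces to simple index arithmetic; a hedged sketch follows (the tensor name is a placeholder, and getNbOptimizationProfiles() is declared later in this class).

```cpp
#include "NvInferRuntime.h"

// For an engine built with K profiles, bindings are grouped per profile: profile p
// owns the index range [p * (getNbBindings() / K), (p + 1) * (getNbBindings() / K)).
int bindingIndexForProfile(const nvinfer1::ICudaEngine& engine, const char* tensorName, int profileIndex)
{
    const int base = engine.getBindingIndex(tensorName);  // index within profile 0
    if (base < 0)
        return -1;

    const int bindingsPerProfile = engine.getNbBindings() / engine.getNbOptimizationProfiles();
    return base + profileIndex * bindingsPerProfile;
}
```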
\return The name corresponding to the index, or nullptr if the index is out of range. + //! + //! \see getBindingIndex() + //! + virtual const char* getBindingName(int bindingIndex) const noexcept = 0; + + //! + //! \brief Determine whether a binding is an input binding. + //! + //! \param bindingIndex The binding index. + //! \return True if the index corresponds to an input binding and the index is in range. + //! + //! \see getBindingIndex() + //! + virtual bool bindingIsInput(int bindingIndex) const noexcept = 0; + + //! + //! \brief Get the dimensions of a binding. + //! + //! \param bindingIndex The binding index. + //! \return The dimensions of the binding if the index is in range, otherwise Dims() + //! Has -1 for any dimension with a dynamic value. + //! + //! \see getBindingIndex() + //! + virtual Dims getBindingDimensions(int bindingIndex) const noexcept = 0; + + //! + //! \brief Determine the required data type for a buffer from its binding index. + //! + //! \param bindingIndex The binding index. + //! \return The type of the data in the buffer. + //! + //! \see getBindingIndex() + //! + virtual DataType getBindingDataType(int bindingIndex) const noexcept = 0; + + //! + //! \brief Get the maximum batch size which can be used for inference. + //! + //! \return The maximum batch size for this engine. + //! + virtual int getMaxBatchSize() const noexcept = 0; + + //! + //! \brief Get the number of layers in the network. + //! + //! The number of layers in the network is not necessarily the number in the original network definition, as layers may be combined or eliminated as the engine is + //! optimized. This value can be useful when building per-layer tables, such as when aggregating profiling data over a number of executions. + //! + //! \return The number of layers in the network. + //! + virtual int getNbLayers() const noexcept = 0; + + //! + //! \brief Get the amount of workspace the engine uses. + //! + //! The workspace size will be no greater than the value provided to the builder when the engine was built, and will typically be smaller. + //! Workspace will be allocated for each execution context. + //! + TRT_DEPRECATED + virtual std::size_t getWorkspaceSize() const noexcept = 0; + + //! + //! \brief Serialize the network to a stream. + //! + //! \return A IHostMemory object that contains the serialized engine. + //! + //! The network may be deserialized with IRuntime::deserializeCudaEngine() and also safe::IRuntime::deserializeCudaEngine() if only functional-safe features are used in the engine. + //! + //! \see IRuntime::deserializeCudaEngine() safe::IRuntime::deserializeCudaEngine() + //! + virtual IHostMemory* serialize() const noexcept = 0; + + //! + //! \brief Create an execution context. + //! + //! \see IExecutionContext. + //! + virtual IExecutionContext* createExecutionContext() noexcept = 0; + + //! + //! \brief Destroy this object; + //! + virtual void destroy() noexcept = 0; + + //! + //! \brief Get location of binding + //! + //! This lets you know whether the binding should be a pointer to device or host memory. + //! + //! \see ITensor::setLocation() ITensor::getLocation() + //! + //! \param bindingIndex The binding index. + //! \return The location of the bound tensor with given index. + //! + virtual TensorLocation getLocation(int bindingIndex) const noexcept = 0; + +protected: + virtual ~ICudaEngine() {} + +public: + //! \brief create an execution context without any device memory allocated + //! + //! 
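getBindingDimensions() and getBindingDataType() together determine how large a buffer each binding needs; a hedged sketch follows, with the caveat that dynamic dimensions are reported as -1 and must instead be resolved through the execution context.

```cpp
#include <cstddef>
#include "NvInferRuntime.h"

// Bytes per element for the DataType values defined in NvInferRuntimeCommon.h.
static std::size_t elementSize(nvinfer1::DataType t)
{
    switch (t)
    {
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kINT8: return 1;
    case nvinfer1::DataType::kINT32: return 4;
    }
    return 0;
}

// Size in bytes of one element of the binding. Returns 0 if any dimension is
// still dynamic (-1); such bindings must be sized through the execution context.
static std::size_t bindingBytes(const nvinfer1::ICudaEngine& engine, int bindingIndex)
{
    const nvinfer1::Dims dims = engine.getBindingDimensions(bindingIndex);
    std::size_t count = 1;
    for (int i = 0; i < dims.nbDims; ++i)
    {
        if (dims.d[i] < 0)
            return 0;
        count *= static_cast<std::size_t>(dims.d[i]);
    }
    return count * elementSize(engine.getBindingDataType(bindingIndex));
}
```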
The memory for execution of this device context must be supplied by the application. + //! + //! \see getDeviceMemorySize() IExecutionContext::setDeviceMemory() + //! + virtual IExecutionContext* createExecutionContextWithoutDeviceMemory() noexcept = 0; + + //! + //! \brief Return the amount of device memory required by an execution context. + //! + //! \see IExecutionContext::setDeviceMemory() + //! + virtual size_t getDeviceMemorySize() const noexcept = 0; + + //! + //! \brief Return true if engine can be refit. + //! + //! \see nvinfer1::createInferRefitter() + //! + virtual bool isRefittable() const noexcept = 0; + + //! + //! \brief Return the number of bytes per component of an element. + //! + //! The vector component size is returned if getBindingVectorizedDim() != -1. + //! + //! \param bindingIndex The binding Index. + //! + //! \see ICudaEngine::getBindingVectorizedDim() + //! + virtual int getBindingBytesPerComponent(int bindingIndex) const noexcept = 0; + + //! + //! \brief Return the number of components included in one element. + //! + //! The number of elements in the vectors is returned if getBindingVectorizedDim() != -1. + //! + //! \param bindingIndex The binding Index. + //! + //! \see ICudaEngine::getBindingVectorizedDim() + //! + virtual int getBindingComponentsPerElement(int bindingIndex) const noexcept = 0; + + //! + //! \brief Return the binding format. + //! + //! \param bindingIndex The binding Index. + //! + virtual TensorFormat getBindingFormat(int bindingIndex) const noexcept = 0; + + //! + //! \brief Return the human readable description of the tensor format. + //! + //! The description includes the order, vectorization, data type, strides, + //! and etc. Examples are shown as follows: + //! Example 1: kCHW + FP32 + //! "Row major linear FP32 format" + //! Example 2: kCHW2 + FP16 + //! "Two wide channel vectorized row major FP16 format" + //! Example 3: kHWC8 + FP16 + Line Stride = 32 + //! "Channel major FP16 format where C % 8 == 0 and H Stride % 32 == 0" + //! + //! \param bindingIndex The binding Index. + //! + virtual const char* getBindingFormatDesc(int bindingIndex) const noexcept = 0; + + //! + //! \brief Return the dimension index that the buffer is vectorized. + //! + //! Specifically -1 is returned if scalars per vector is 1. + //! + //! \param bindingIndex The binding Index. + //! + virtual int getBindingVectorizedDim(int bindingIndex) const noexcept = 0; + + //! + //! \brief Returns the name of the network associated with the engine. + //! + //! The name is set during network creation and is retrieved after + //! building or deserialization. + //! + //! \see INetworkDefinition::setName(), INetworkDefinition::getName() + //! + //! \return A zero delimited C-style string representing the name of the network. + //! + virtual const char* getName() const noexcept = 0; + + //! + //! \brief Get the number of optimization profiles defined for this engine. + //! + //! \return Number of optimization profiles. It is always at least 1. + //! + //! \see IExecutionContext::setOptimizationProfile() + virtual int getNbOptimizationProfiles() const noexcept = 0; + + //! + //! \brief Get the minimum / optimum / maximum dimensions for a particular binding under an optimization profile. + //! + //! \param bindingIndex The binding index (must be between 0 and getNbBindings() - 1) + //! + //! \param profileIndex The profile index (must be between 0 and getNbOptimizationProfiles()-1) + //! + //! 
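A hedged sketch of application-managed context memory using getDeviceMemorySize(), createExecutionContextWithoutDeviceMemory(), and IExecutionContext::setDeviceMemory() (declared further below); error handling is minimal and the caller owns the scratch allocation.

```cpp
#include <cuda_runtime_api.h>
#include "NvInferRuntime.h"

// The application owns the scratch memory instead of letting the context allocate it.
// On success the caller must keep *scratchOut alive until the context is destroyed.
nvinfer1::IExecutionContext* createContextWithOwnMemory(nvinfer1::ICudaEngine& engine, void** scratchOut)
{
    void* scratch = nullptr;
    if (cudaMalloc(&scratch, engine.getDeviceMemorySize()) != cudaSuccess)
        return nullptr;

    nvinfer1::IExecutionContext* context = engine.createExecutionContextWithoutDeviceMemory();
    if (!context)
    {
        cudaFree(scratch);
        return nullptr;
    }

    context->setDeviceMemory(scratch);  // cudaMalloc already satisfies the alignment requirement
    *scratchOut = scratch;
    return context;
}
```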
\param select Whether to query the minimum, optimum, or maximum dimensions for this binding. + //! + //! \return The minimum / optimum / maximum dimensions for this binding in this profile. + virtual Dims getProfileDimensions(int bindingIndex, int profileIndex, OptProfileSelector select) const noexcept = 0; + + //! + //! \brief Get minimum / optimum / maximum values for an input shape binding under an optimization profile. + //! + //! \param profileIndex The profile index (must be between 0 and getNbOptimizationProfiles()-1) + //! + //! \param inputIndex The input index (must be between 0 and getNbBindings() - 1) + //! + //! \param select Whether to query the minimum, optimum, or maximum shape values for this binding. + //! + //! \return If the binding is an input shape binding, return a pointer to an array that has + //! the same number of elements as the corresponding tensor, i.e. 1 if dims.nbDims == 0, or dims.d[0] + //! if dims.nbDims == 1, where dims = getBindingDimensions(inputIndex). The array contains + //! the elementwise minimum / optimum / maximum values for this shape binding under the profile. + //! If either of the indices is out of range, or if the binding is not an input shape binding, return + //! nullptr. + virtual const int32_t* getProfileShapeValues(int profileIndex, int inputIndex, OptProfileSelector select) const + noexcept + = 0; + + //! + //! \brief True if tensor is required as input for shape calculations or output from them. + //! + //! TensorRT evaluates a network in two phases: + //! + //! 1. Compute shape information required to determine memory allocation requirements + //! and validate that runtime sizes make sense. + //! + //! 2. Process tensors on the device. + //! + //! Some tensors are required in phase 1. These tensors are called "shape tensors", and always + //! have type Int32 and no more than one dimension. These tensors are not always shapes + //! themselves, but might be used to calculate tensor shapes for phase 2. + //! + //! isShapeBinding(i) returns true if the tensor is a required input or an output computed in phase 1. + //! isExecutionBinding(i) returns true if the tensor is a required input or an output computed in phase 2. + //! + //! For example, if a network uses an input tensor with binding i as an addend + //! to an IElementWiseLayer that computes the "reshape dimensions" for IShuffleLayer, + //! then isShapeBinding(i) == true. + //! + //! It's possible to have a tensor be required by both phases. For instance, a tensor + //! can be used for the "reshape dimensions" and as the indices for an IGatherLayer + //! collecting floating-point data. + //! + //! It's also possible to have a tensor be required by neither phase, but nonetheless + //! shows up in the engine's inputs. For example, if an input tensor is used only + //! as an input to IShapeLayer, only its shape matters and its values are irrelevant. + //! + //! \see isExecutionBinding() + //! + virtual bool isShapeBinding(int bindingIndex) const noexcept = 0; + + //! + //! \brief True if pointer to tensor data is required for execution phase, false if nullptr can be supplied. + //! + //! For example, if a network uses an input tensor with binding i ONLY as the "reshape dimensions" + //! input of IShuffleLayer, then isExecutionBinding(i) is false, and a nullptr can be + //! supplied for it when calling IExecutionContext::execute or IExecutionContext::enqueue. + //! + //! \see isShapeBinding() + //! + virtual bool isExecutionBinding(int bindingIndex) const noexcept = 0; + + //! + //! 
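A small hedged sketch that prints how each binding participates in the two evaluation phases described above, using only the query methods of this class; it is mainly useful for debugging which bindings may legally receive nullptr.

```cpp
#include <cstdio>
#include "NvInferRuntime.h"

// Print, for every binding, whether it is an input and whether it participates
// in the shape-computation phase and/or the device-execution phase.
void describeBindings(const nvinfer1::ICudaEngine& engine)
{
    for (int i = 0; i < engine.getNbBindings(); ++i)
    {
        std::printf("%-3d %-40s input=%d shape=%d execution=%d\n", i, engine.getBindingName(i),
            static_cast<int>(engine.bindingIsInput(i)),
            static_cast<int>(engine.isShapeBinding(i)),
            static_cast<int>(engine.isExecutionBinding(i)));
    }
}
```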
\brief determine that execution capability this engine has. + //! + //! If the engine has EngineCapability::kDEFAULT, then all engine functionality is valid.. + //! If the engine has EngineCapability::kSAFE_GPU, then only the functionality in safe::ICudaEngine is valid. + //! If the engine has EngineCapability::kSAFE_DLA, then only serialize, destroy, and const-accessor functions are valid. + //! + //! \return The EngineCapability flag that the engine was built for. + //! + virtual EngineCapability getEngineCapability() const noexcept = 0; + + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! + virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0; + + //! + //! \brief get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. + //! + //! \return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual IErrorRecorder* getErrorRecorder() const noexcept = 0; +}; + +//! +//! \class IExecutionContext +//! +//! \brief Context for executing inference using an engine, with functionally unsafe features. +//! +//! Multiple execution contexts may exist for one ICudaEngine instance, allowing the same +//! engine to be used for the execution of multiple batches simultaneously. If the engine supports +//! dynamic shapes, each execution context in concurrent use must use a separate optimization profile. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +class IExecutionContext +{ +public: + //! + //! \brief Synchronously execute inference on a batch. + //! + //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be queried using ICudaEngine::getBindingIndex() + //! \param batchSize The batch size. This is at most the value supplied when the engine was built. + //! \param bindings An array of pointers to input and output buffers for the network. + //! + //! \return True if execution succeeded. + //! + //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize() + //! + virtual bool execute(int batchSize, void** bindings) noexcept = 0; + + //! + //! \brief Asynchronously execute inference on a batch. + //! + //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be queried using ICudaEngine::getBindingIndex() + //! \param batchSize The batch size. This is at most the value supplied when the engine was built. + //! \param bindings An array of pointers to input and output buffers for the network. + //! \param stream A cuda stream on which the inference kernels will be enqueued + //! \param inputConsumed An optional event which will be signaled when the input buffers can be refilled with new data + //! + //! \return True if the kernels were enqueued successfully. + //! + //! 
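A hedged sketch of the implicit-batch enqueue path described above, assuming an engine with exactly one input and one output whose tensor names are the placeholders "input" and "output"; the device buffers and stream are provided by the caller.

```cpp
#include "NvInferRuntime.h"

// Assumes an implicit-batch engine with exactly one input and one output;
// "input" and "output" are placeholder tensor names from the network definition.
bool runBatch(nvinfer1::IExecutionContext& context, void* devInput, void* devOutput,
              int batchSize, cudaStream_t stream)
{
    const nvinfer1::ICudaEngine& engine = context.getEngine();

    // The buffer array is ordered by binding index.
    void* bindings[2] = {nullptr, nullptr};
    bindings[engine.getBindingIndex("input")] = devInput;
    bindings[engine.getBindingIndex("output")] = devOutput;

    // Asynchronous variant; pass nullptr when no "input consumed" event is needed.
    return context.enqueue(batchSize, bindings, stream, nullptr);
}
```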
\see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize() + //! + virtual bool enqueue(int batchSize, void** bindings, cudaStream_t stream, cudaEvent_t* inputConsumed) noexcept = 0; + + //! + //! \brief Set the debug sync flag. + //! + //! If this flag is set to true, the engine will log the successful execution for each kernel during execute(). It has no effect when using enqueue(). + //! + //! \see getDebugSync() + //! + virtual void setDebugSync(bool sync) noexcept = 0; + + //! + //! \brief Get the debug sync flag. + //! + //! \see setDebugSync() + //! + virtual bool getDebugSync() const noexcept = 0; + + //! + //! \brief Set the profiler. + //! + //! \see IProfiler getProfiler() + //! + virtual void setProfiler(IProfiler*) noexcept = 0; + + //! + //! \brief Get the profiler. + //! + //! \see IProfiler setProfiler() + //! + virtual IProfiler* getProfiler() const noexcept = 0; + + //! + //! \brief Get the associated engine. + //! + //! \see ICudaEngine + //! + virtual const ICudaEngine& getEngine() const noexcept = 0; + + //! + //! \brief Destroy this object. + //! + virtual void destroy() noexcept = 0; + +protected: + virtual ~IExecutionContext() noexcept {} + +public: + //! + //! \brief Set the name of the execution context. + //! + //! This method copies the name string. + //! + //! \see getName() + //! + virtual void setName(const char* name) noexcept = 0; + + //! + //! \brief Return the name of the execution context. + //! + //! \see setName() + //! + virtual const char* getName() const noexcept = 0; + + //! + //! \brief set the device memory for use by this execution context. + //! + //! The memory must be aligned with cuda memory alignment property (using cudaGetDeviceProperties()), and its size must be at least that + //! returned by getDeviceMemorySize(). If using enqueue() to run the network, The memory is in + //! use from the invocation of enqueue() until network execution is complete. If using execute(), + //! it is in use until execute() returns. Releasing or otherwise using the memory for other + //! purposes during this time will result in undefined behavior. + //! + //! \see ICudaEngine::getDeviceMemorySize() ICudaEngine::createExecutionContextWithoutDeviceMemory() + //! + virtual void setDeviceMemory(void* memory) noexcept = 0; + + //! + //! \brief Return the strides of the buffer for the given binding. + //! + //! Note that strides can be different for different execution contexts + //! with dynamic shapes. + //! + //! \param bindingIndex The binding index. + //! + virtual Dims getStrides(int bindingIndex) const noexcept = 0; + +public: + //! + //! \brief Select an optimization profile for the current context. + //! + //! \param profileIndex Index of the profile. It must lie between 0 and + //! getEngine().getNbOptimizationProfiles() - 1 + //! + //! The selected profile will be used in subsequent calls to execute() or enqueue(). + //! + //! If the associated CUDA engine has dynamic inputs, this method must be called exactly once + //! with a unique profileIndex before calling execute or enqueue (i.e. the profile index + //! may not be in use by another execution context that has not been destroyed yet). Once the + //! optimization profile has been set (getOptimizationProfile() != -1), it cannot be changed. + //! For the first execution context that is created for an engine, setOptimizationProfile(0) + //! is called implicitly. This means users only ever need to call this method if they need more + //! than a single execution context. 
In this case, profileIdx must be nonzero and unique for + //! all execution contexts that are created after the first. + //! + //! If the associated CUDA engine has not dynamic inputs, this method need not be + //! called, in which case the default profile index of 0 will be used (this is particularly + //! the case for all safe engines). + //! + //! setOptimizationProfile() must be called before calling setBindingDimensions() and + //! setInputShapeBinding() for all dynamic input tensors or input shape tensors, which in + //! turn must be called before either execute() or enqueue(). + //! + //! \return true if the call succeeded, else false (e.g. input out of range) + //! + //! \see ICudaEngine::getNbOptimizationProfiles() + virtual bool setOptimizationProfile(int profileIndex) noexcept = 0; + + //! + //! \brief Get the index of the currently selected optimization profile. + //! + //! If the profile index has not been set yet (implicitly to 0 for the first execution context + //! to be created, or explicitly for all subsequent contexts), an invalid value of -1 will be returned + //! and all calls to enqueue() or execute() will fail until a valid profile index has been set. + //! + virtual int getOptimizationProfile() const noexcept = 0; + + //! + //! \brief Set the dynamic dimensions of a binding + //! + //! Requires the engine to be built without an implicit batch dimension. + //! The binding must be an input tensor, and all dimensions must be compatible with + //! the network definition (i.e. only the wildcard dimension -1 can be replaced with a + //! new dimension > 0). Furthermore, the dimensions must be in the valid range for the + //! currently selected optimization profile, and the corresponding engine must not be + //! safety-certified. + //! This method will fail unless a valid optimization profile is defined for the current + //! execution context (getOptimizationProfile() must not be -1). + //! + //! For all dynamic non-output bindings (which have at least one wildcard dimension of -1), + //! this method needs to be called before either enqueue() or execute() may be called. + //! This can be checked using the method allInputDimensionsSpecified(). + //! + //! \return false if an error occurs (e.g. index out of range), else true + //! + virtual bool setBindingDimensions(int bindingIndex, Dims dimensions) noexcept = 0; + + //! + //! \brief Get the dynamic dimensions of a binding + //! + //! If the engine was built with an implicit batch dimension, same as ICudaEngine::getBindingDimensions. + //! + //! If setBindingDimensions() has been called on this binding (or if there are no + //! dynamic dimensions), all dimensions will be positive. Otherwise, it is necessary to + //! call setBindingDimensions() before enqueue() or execute() may be called. + //! + //! If the bindingIndex is out of range, an invalid Dims with nbDims == -1 is returned. + //! The same invalid Dims will be returned if the engine was not built with an implicit + //! batch dimension and if the execution context is not currently associated with a valid + //! optimization profile (i.e. if getOptimizationProfile() returns -1). + //! + //! If ICudaEngine::bindingIsInput(bindingIndex) is false, then both + //! allInputDimensionsSpecified() and allInputShapesSpecified() must be true + //! before calling this method. + //! + //! \return Currently selected binding dimensions + //! + virtual Dims getBindingDimensions(int bindingIndex) const noexcept = 0; + + //! + //! 
\brief Set values of input tensor required by shape calculations. + //! + //! \param bindingIndex index of an input tensor for which + //! ICudaEngine::isShapeBinding(bindingIndex) and ICudaEngine::bindingIsInput(bindingIndex) + //! are both true. + //! + //! \param data pointer to values of the input tensor. The number of values should be + //! the product of the dimensions returned by getBindingDimensions(bindingIndex). + //! + //! If ICudaEngine::isShapeBinding(bindingIndex) and ICudaEngine::bindingIsInput(bindingIndex) + //! are both true, this method must be called before enqueue() or execute() may be called. + //! This method will fail unless a valid optimization profile is defined for the current + //! execution context (getOptimizationProfile() must not be -1). + //! + virtual bool setInputShapeBinding(int bindingIndex, const int32_t* data) noexcept = 0; + + //! + //! \brief Get values of an input tensor required for shape calculations or an output tensor produced by shape calculations. + //! + //! \param bindingIndex index of an input or output tensor for which + //! ICudaEngine::isShapeBinding(bindingIndex) is true. + //! + //! \param data pointer to where values will be written. The number of values written is + //! the product of the dimensions returned by getBindingDimensions(bindingIndex). + //! + //! If ICudaEngine::bindingIsInput(bindingIndex) is false, then both + //! allInputDimensionsSpecified() and allInputShapesSpecified() must be true + //! before calling this method. The method will also fail if no valid optimization profile + //! has been set for the current execution context, i.e. if getOptimizationProfile() returns -1. + //! + //! \see isShapeBinding(bindingIndex) + //! + virtual bool getShapeBinding(int bindingIndex, int32_t* data) const noexcept = 0; + + //! + //! \brief Whether all dynamic dimensions of input tensors have been specified + //! + //! \return True if all dynamic dimensions of input tensors have been specified + //! by calling setBindingDimensions(). + //! + //! Trivially true if network has no dynamically shaped input tensors. + //! + //! \see setBindingDimensions(bindingIndex,dimensions) + //! + virtual bool allInputDimensionsSpecified() const noexcept = 0; + + //! + //! \brief Whether all input shape bindings have been specified + //! + //! \return True if all input shape bindings have been specified by setInputShapeBinding(). + //! + //! Trivially true if network has no input shape bindings. + //! + //! \see isShapeBinding(bindingIndex) + //! + virtual bool allInputShapesSpecified() const noexcept = 0; + + //! + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! + virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0; + + //! + //! \brief get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. + //! + //! 
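A hedged sketch of the dynamic-shape sequence described above: select a profile, resolve the wildcard dimensions, supply shape-tensor values, verify everything is specified, then launch with enqueueV2() (declared just below). The binding indices and dimensions are placeholders.

```cpp
#include <cstdint>
#include "NvInferRuntime.h"

// Assumes binding 0 is a dynamic input declared as {-1, 3, -1, -1} and binding 1
// is a four-element input shape tensor; bindings and stream come from the caller.
bool runDynamic(nvinfer1::IExecutionContext& context, void** bindings, cudaStream_t stream)
{
    // Each context in concurrent use needs its own profile; profile 0 is implicit
    // for the first context created from the engine.
    if (context.getOptimizationProfile() < 0 && !context.setOptimizationProfile(0))
        return false;

    // Resolve the wildcard (-1) dimensions for this execution.
    nvinfer1::Dims dims;
    dims.nbDims = 4;
    dims.d[0] = 4; dims.d[1] = 3; dims.d[2] = 224; dims.d[3] = 224;
    if (!context.setBindingDimensions(0, dims))
        return false;

    // Supply the values of the input shape tensor.
    const int32_t shapeValues[4] = {4, 3, 224, 224};
    if (!context.setInputShapeBinding(1, shapeValues))
        return false;

    if (!context.allInputDimensionsSpecified() || !context.allInputShapesSpecified())
        return false;

    return context.enqueueV2(bindings, stream, nullptr);
}
```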
\return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual IErrorRecorder* getErrorRecorder() const noexcept = 0; + + //! + //! \brief Synchronously execute inference a network. + //! + //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be + //! queried using ICudaEngine::getBindingIndex(). + //! This method only works for execution contexts built with full dimension networks. + //! \param bindings An array of pointers to input and output buffers for the network. + //! + //! \return True if execution succeeded. + //! + //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize() + //! + virtual bool executeV2(void** bindings) noexcept = 0; + + //! + //! \brief Asynchronously execute inference. + //! + //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be + //! queried using ICudaEngine::getBindingIndex(). + //! This method only works for execution contexts built with full dimension networks. + //! \param bindings An array of pointers to input and output buffers for the network. + //! \param stream A cuda stream on which the inference kernels will be enqueued + //! \param inputConsumed An optional event which will be signaled when the input buffers can be refilled with new + //! data + //! + //! \return True if the kernels were enqueued successfully. + //! + //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize() + //! + virtual bool enqueueV2(void** bindings, cudaStream_t stream, cudaEvent_t* inputConsumed) noexcept = 0; +}; +} + +extern "C" TENSORRTAPI void* createInferRuntime_INTERNAL(void* logger, int version); //!< Internal C entry point for creating IRuntime. + +extern "C" TENSORRTAPI void* createInferRefitter_INTERNAL(void* engine, void* logger, int version); //!< Internal C entry point for creating IRefitter. + +namespace nvinfer1 +{ +namespace // unnamed namespace avoids linkage surprises when linking objects built with different versions of this header. +{ +//! +//! \brief Create an instance of an IRuntime class. +//! +//! This class is the logging class for the runtime. +//! +inline IRuntime* createInferRuntime(ILogger& logger) +{ + return static_cast(createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} +//! +//! \brief Create an instance of an IRefitter class. +//! +//! This class is the logging class for the refitter. +//! +inline IRefitter* createInferRefitter(ICudaEngine& engine, ILogger& logger) +{ + return static_cast(createInferRefitter_INTERNAL(&engine, &logger, NV_TENSORRT_VERSION)); +} +} +} + +#endif // NV_INFER_RUNTIME_H diff --git a/include/NvInferRuntimeCommon.h b/include/NvInferRuntimeCommon.h new file mode 100644 index 00000000..6d3308e6 --- /dev/null +++ b/include/NvInferRuntimeCommon.h @@ -0,0 +1,1281 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NV_INFER_RUNTIME_COMMON_H +#define NV_INFER_RUNTIME_COMMON_H + +#include +#include +#include "NvInferVersion.h" + +#if __cplusplus > 201103L +#define _TENSORRT_FINAL final +#define _TENSORRT_OVERRIDE override +#else +#define _TENSORRT_FINAL +#define _TENSORRT_OVERRIDE +#endif + +//!< Items that are marked as deprecated will be removed in a future release. +#if __cplusplus >= 201402L +#define TRT_DEPRECATED [[deprecated]] +#if __GNUC__ < 6 +#define TRT_DEPRECATED_ENUM +#else +#define TRT_DEPRECATED_ENUM TRT_DEPRECATED +#endif +#ifdef _MSC_VER +#define TRT_DEPRECATED_API __declspec(dllexport) +#else +#define TRT_DEPRECATED_API [[deprecated]] __attribute__((visibility("default"))) +#endif +#else +#ifdef _MSC_VER +#define TRT_DEPRECATED +#define TRT_DEPRECATED_ENUM +#define TRT_DEPRECATED_API __declspec(dllexport) +#else +#define TRT_DEPRECATED __attribute__((deprecated)) +#define TRT_DEPRECATED_ENUM +#define TRT_DEPRECATED_API __attribute__((deprecated, visibility("default"))) +#endif +#endif + +//!< Defines which symbols are exported +#ifdef TENSORRT_BUILD_LIB +#ifdef _MSC_VER +#define TENSORRTAPI __declspec(dllexport) +#else +#define TENSORRTAPI __attribute__((visibility("default"))) +#endif +#else +#define TENSORRTAPI +#endif +#define TRTNOEXCEPT +//! +//! \file NvInferRuntimeCommon.h +//! +//! This is the top-level API file for TensorRT core runtime library. +//! + +// forward declare some CUDA types to avoid an include dependency + +struct cublasContext; +struct cudnnContext; + +typedef struct CUstream_st* cudaStream_t; //!< Forward declaration of cudaStream_t. +typedef struct CUevent_st* cudaEvent_t; //!< Forward declaration of cudaEvent_t. + +static const int NV_TENSORRT_VERSION = (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH; // major, minor, patch + +//! +//! \namespace nvinfer1 +//! +//! \brief The TensorRT API version 1 namespace. +//! +namespace nvinfer1 +{ + +class IErrorRecorder; //!< Forward declare IErrorRecorder for use in other interfaces. +class IGpuAllocator; //!< Forward declare IGpuAllocator for use in other interfaces. + +//! +//! \enum ActivationType +//! +//! \brief Enumerates the types of activation to perform in an activation layer. +//! +enum class ActivationType : int +{ + kRELU = 0, //!< Rectified linear activation. + kSIGMOID = 1, //!< Sigmoid activation. + kTANH = 2, //!< TanH activation. + kLEAKY_RELU = 3, //!< LeakyRelu activation: x>=0 ? x : alpha * x. + kELU = 4, //!< Elu activation: x>=0 ? x : alpha * (exp(x) - 1). + kSELU = 5, //!< Selu activation: x>0 ? beta * x : beta * (alpha*exp(x) - alpha) + kSOFTSIGN = 6, //!< Softsign activation: x / (1+|x|) + kSOFTPLUS = 7, //!< Parametric softplus activation: alpha*log(exp(beta*x)+1) + kCLIP = 8, //!< Clip activation: max(alpha, min(beta, x)) + kHARD_SIGMOID = 9, //!< Hard sigmoid activation: max(0, min(1, alpha*x+beta)) + kSCALED_TANH = 10, //!< Scaled tanh activation: alpha*tanh(beta*x) + kTHRESHOLDED_RELU = 11 //!< Thresholded ReLU activation: x>alpha ? x : 0 +}; + +template +constexpr inline int EnumMax(); //!< Maximum number of elements in an enumeration type. + +template <> +constexpr inline int EnumMax() +{ + return 12; +} //!< Maximum number of elements in ActivationType enum. \see ActivationType + +//! +//! \enum DataType +//! \brief The type of weights and tensors. +//! +enum class DataType : int +{ + kFLOAT = 0, //!< FP32 format. + kHALF = 1, //!< FP16 format. + kINT8 = 2, //!< quantized INT8 format. + kINT32 = 3 //!< INT32 format. 
+}; + +template <> +constexpr inline int EnumMax() +{ + return 4; +} //!< Maximum number of elements in DataType enum. \see DataType + +//! +//! \enum DimensionType +//! \brief The type of data encoded across this dimension. +//! +enum class DimensionType : int +{ + kSPATIAL = 0, //!< Elements correspond to different spatial data. + kCHANNEL = 1, //!< Elements correspond to different channels. + kINDEX = 2, //!< Elements correspond to different batch index. + kSEQUENCE = 3 //!< Elements correspond to different sequence values. +}; + +template <> +constexpr inline int EnumMax() +{ + return 4; +} //!< Maximum number of elements in DimensionType enum. \see DimensionType + +//! +//! \class Dims +//! \brief Structure to define the dimensions of a tensor. +//! +//! \note: Currently the following formats are supported for layer inputs and outputs: +//! * zero or more index dimensions followed by one channel and two spatial dimensions (e.g. CHW) +//! * one time series dimension followed by one index dimension followed by one channel dimension (i.e. TNC) +//! +//! TensorRT can also return an invalid dims structure. This structure is represented by nbDims == -1 +//! and d[i] == 0 for all d. +//! +class Dims +{ +public: + static const int MAX_DIMS = 8; //!< The maximum number of dimensions supported for a tensor. + int nbDims; //!< The number of dimensions. + int d[MAX_DIMS]; //!< The extent of each dimension. + TRT_DEPRECATED DimensionType type[MAX_DIMS]; //!< The type of each dimension. +}; + +//! +//! \brief It is capable of representing one or more TensorFormat by binary OR +//! operations, e.g., 1U << TensorFormats::kCHW4 | 1U << TensorFormats::kCHW32. +//! +//! \see ITensor::getAllowedFormats(), ITensor::setAllowedFormats(), +//! +typedef uint32_t TensorFormats; + +//! +//! \enum TensorFormat +//! +//! \brief Format of the input/output tensors. +//! +//! This enum is extended to be used by both plugins and reformat-free network +//! I/O tensors. +//! +//! \see IPluginExt::getPluginFormats(), safe::ICudaEngine::getBindingFormat() +//! +//! For more information about data formats, see the topic "Data Format Description" located in the +//! TensorRT Developer Guide (https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html). +//! +enum class TensorFormat : int +{ + //! Row major linear format. + //! For a tensor with dimensions {N, C, H, W} or {numbers, channels, + //! columns, rows}, the dimensional index corresponds to {3, 2, 1, 0} + //! and thus the order is W minor. + kLINEAR = 0, + kNCHW TRT_DEPRECATED_ENUM = kLINEAR, //! <-- Deprecated, used for backward compatibility + + //! Two wide channel vectorized row major format. This format is bound to + //! FP16. It is only available for dimensions >= 3. + //! For a tensor with dimensions {N, C, H, W}, + //! the memory layout is equivalent to a C array with dimensions + //! [N][(C+1)/2][H][W][2], with the tensor coordinates (n, c, h, w) + //! mapping to array subscript [n][c/2][h][w][c%2]. + kCHW2 = 1, + kNC2HW2 TRT_DEPRECATED_ENUM = kCHW2, //! <-- Deprecated, used for backward compatibility + + //! Eight channel format where C is padded to a multiple of 8. This format + //! is bound to FP16. It is only available for dimensions >= 3. + //! For a tensor with dimensions {N, H, W, C}, + //! the memory layout is equivalent to the array with dimensions + //! [N][H][W][(C+7)/8*8], with the tensor coordinates (n, h, w, c) + //! mapping to array subscript [n][h][w][c]. + kHWC8 = 2, + kNHWC8 TRT_DEPRECATED_ENUM = kHWC8, //! 
<-- Deprecated, used for backward compatibility + + //! Four wide channel vectorized row major format. This format is bound to + //! INT8 or FP16. It is only available for dimensions >= 3. + //! For a tensor with dimensions {N, C, H, W}, + //! the memory layout is equivalent to a C array with dimensions + //! [N][(C+3)/4][H][W][4], with the tensor coordinates (n, c, h, w) + //! mapping to array subscript [n][c/4][h][w][c%4]. + kCHW4 = 3, + + //! Sixteen wide channel vectorized row major format. This format is bound + //! to FP16. It is only available for dimensions >= 3. + //! For a tensor with dimensions {N, C, H, W}, + //! the memory layout is equivalent to a C array with dimensions + //! [N][(C+15)/16][H][W][16], with the tensor coordinates (n, c, h, w) + //! mapping to array subscript [n][c/16][h][w][c%16]. + kCHW16 = 4, + + //! Thirty-two wide channel vectorized row major format. This format is + //! bound to INT8 or FP32. It is only available for dimensions >= 3. + //! For a tensor with dimensions {N, C, H, W}, + //! the memory layout is equivalent to a C array with dimensions + //! [N][(C+31)/32][H][W][32], with the tensor coordinates (n, c, h, w) + //! mapping to array subscript [n][c/32][h][w][c%32]. + kCHW32 = 5 +}; + +//! +//! \brief PluginFormat is reserved for backward compatibility. +//! +//! \see IPluginExt::getPluginFormats() +//! +using PluginFormat = TensorFormat; + +template <> +constexpr inline int EnumMax() +{ + return 6; +} //!< Maximum number of elements in TensorFormat enum. \see TensorFormat + +//! \struct PluginTensorDesc +//! +//! \brief Fields that a plugin might see for an input or output. +//! +//! Scale is only valid when data type is DataType::kINT8. TensorRT will set +//! the value to -1.0f if it is invalid. +//! +//! \see IPluginV2IOExt::supportsFormat +//! \see IPluginV2IOExt::configurePlugin +//! +struct PluginTensorDesc +{ + Dims dims; + DataType type; + TensorFormat format; + float scale; +}; + +//! \struct PluginVersion +//! +//! \brief Definition of plugin versions. +//! +//! Tag for plug-in versions. Used in upper byte of getTensorRTVersion(). +//! +enum class PluginVersion : uint8_t +{ + kV2 = 0, //! IPluginV2 + kV2_EXT = 1, //! IPluginV2Ext + kV2_IOEXT = 2, //! IPluginV2IOExt + kV2_DYNAMICEXT = 3, //! IPluginV2DynamicExt +}; + +//! \class IPluginV2 +//! +//! \brief Plugin class for user-implemented layers. +//! +//! Plugins are a mechanism for applications to implement custom layers. When +//! combined with IPluginCreator it provides a mechanism to register plugins and +//! look up the Plugin Registry during de-serialization. +//! +//! \see IPluginCreator +//! \see IPluginRegistry +//! +class IPluginV2 +{ +public: + //! + //! \brief Return the API version with which this plugin was built. + //! + //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with plugins. + //! + virtual int getTensorRTVersion() const TRTNOEXCEPT + { + return NV_TENSORRT_VERSION; + } + + //! + //! \brief Return the plugin type. Should match the plugin name returned by the corresponding plugin creator + // \see IPluginCreator::getPluginName() + //! + virtual const char* getPluginType() const TRTNOEXCEPT = 0; + + //! + //! \brief Return the plugin version. Should match the plugin version returned by the corresponding plugin creator + // \see IPluginCreator::getPluginVersion() + //! + virtual const char* getPluginVersion() const TRTNOEXCEPT = 0; + + //! + //! \brief Get the number of outputs from the layer. + //! + //! 
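As a small illustration of the TensorFormats bit mask described above (a uint32_t whose bit positions are TensorFormat enum values, as consumed for example by ITensor::setAllowedFormats() in NvInfer.h), here is a hedged sketch; formatsMask is an illustrative helper, not part of the API.

```cpp
#include <cstdint>
#include "NvInferRuntimeCommon.h"

// A TensorFormats value is a bit mask whose bit positions are TensorFormat enum values.
constexpr nvinfer1::TensorFormats formatsMask(nvinfer1::TensorFormat a, nvinfer1::TensorFormat b)
{
    return (1U << static_cast<uint32_t>(a)) | (1U << static_cast<uint32_t>(b));
}

// Example: allow either the linear layout or the 32-wide channel-vectorized layout.
constexpr nvinfer1::TensorFormats kAllowedFormats
    = formatsMask(nvinfer1::TensorFormat::kLINEAR, nvinfer1::TensorFormat::kCHW32);
```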
\return The number of outputs. + //! + //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). + //! + virtual int getNbOutputs() const TRTNOEXCEPT = 0; + + //! + //! \brief Get the dimension of an output tensor. + //! + //! \param index The index of the output tensor. + //! \param inputs The input tensors. + //! \param nbInputDims The number of input tensors. + //! + //! This function is called by the implementations of INetworkDefinition and IBuilder. In particular, it is called prior to any call to initialize(). + //! + virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRTNOEXCEPT = 0; + + //! + //! \brief Check format support. + //! + //! \param type DataType requested. + //! \param format PluginFormat requested. + //! \return true if the plugin supports the type-format combination. + //! + //! This function is called by the implementations of INetworkDefinition, IBuilder, and safe::ICudaEngine/ICudaEngine. + //! In particular, it is called when creating an engine and when deserializing an engine. + //! + //! \warning for the format field, the values PluginFormat::kCHW4, PluginFormat::kCHW16, and PluginFormat::kCHW32 + //! will not be passed in, this is to keep backward compatibility with TensorRT 5.x series. Use PluginV2IOExt + //! or PluginV2DynamicExt for other PluginFormats. + //! + virtual bool supportsFormat(DataType type, PluginFormat format) const TRTNOEXCEPT = 0; + + //! + //! \brief Configure the layer. + //! + //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis + //! of its weights, dimensions, and maximum batch size. + //! + //! \param inputDims The input tensor dimensions. + //! \param nbInputs The number of inputs. + //! \param outputDims The output tensor dimensions. + //! \param nbOutputs The number of outputs. + //! \param type The data type selected for the engine. + //! \param format The format selected for the engine. + //! \param maxBatchSize The maximum batch size. + //! + //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). + //! + //! \warning for the format field, the values PluginFormat::kCHW4, PluginFormat::kCHW16, and PluginFormat::kCHW32 + //! will not be passed in, this is to keep backward compatibility with TensorRT 5.x series. Use PluginV2IOExt + //! or PluginV2DynamicExt for other PluginFormats. + //! + virtual void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) TRTNOEXCEPT = 0; + + //! + //! \brief Initialize the layer for execution. This is called when the engine is created. + //! + //! \return 0 for success, else non-zero (which will cause engine termination). + //! + virtual int initialize() TRTNOEXCEPT = 0; + + //! + //! \brief Release resources acquired during plugin layer initialization. This is called when the engine is destroyed. + //! \see initialize() + //! + virtual void terminate() TRTNOEXCEPT = 0; + + //! + //! \brief Find the workspace size required by the layer. + //! + //! This function is called during engine startup, after initialize(). The workspace size returned should be sufficient for any + //! batch size up to the maximum. + //! + //! \return The workspace size. + //! 
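+ //! As an illustrative sketch only (not part of the original header), a plugin whose scratch
+ //! requirement scales linearly with the batch might implement this as:
+ //!
+ //!     size_t getWorkspaceSize(int maxBatchSize) const override
+ //!     {
+ //!         // mOutputVolume is a hypothetical member computed during configuration
+ //!         return static_cast<size_t>(maxBatchSize) * mOutputVolume * sizeof(float);
+ //!     }
+ //!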
+ virtual size_t getWorkspaceSize(int maxBatchSize) const TRTNOEXCEPT = 0; + + //! + //! \brief Execute the layer. + //! + //! \param batchSize The number of inputs in the batch. + //! \param inputs The memory for the input tensors. + //! \param outputs The memory for the output tensors. + //! \param workspace Workspace for execution. + //! \param stream The stream in which to execute the kernels. + //! + //! \return 0 for success, else non-zero (which will cause engine termination). + //! + virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) TRTNOEXCEPT = 0; + + //! + //! \brief Find the size of the serialization buffer required. + //! + //! \return The size of the serialization buffer. + //! + virtual size_t getSerializationSize() const TRTNOEXCEPT = 0; + + //! + //! \brief Serialize the layer. + //! + //! \param buffer A pointer to a buffer to serialize data. Size of buffer must be equal to value returned by getSerializationSize. + //! + //! \see getSerializationSize() + //! + virtual void serialize(void* buffer) const TRTNOEXCEPT = 0; + + //! + //! \brief Destroy the plugin object. This will be called when the network, builder or engine is destroyed. + //! + virtual void destroy() TRTNOEXCEPT = 0; + + //! + //! \brief Clone the plugin object. This copies over internal plugin parameters and returns a new plugin object with these parameters. + //! + virtual IPluginV2* clone() const TRTNOEXCEPT = 0; + + //! + //! \brief Set the namespace that this plugin object belongs to. Ideally, all plugin + //! objects from the same plugin library should have the same namespace. + //! + virtual void setPluginNamespace(const char* pluginNamespace) TRTNOEXCEPT = 0; + + //! + //! \brief Return the namespace of the plugin object. + //! + virtual const char* getPluginNamespace() const TRTNOEXCEPT = 0; + +protected: + virtual ~IPluginV2() {} +}; + +//! \class IPluginV2Ext +//! +//! \brief Plugin class for user-implemented layers. +//! +//! Plugins are a mechanism for applications to implement custom layers. This +//! interface provides additional capabilities to the IPluginV2 interface by +//! supporting different output data types and broadcast across batch. +//! +//! \see IPluginV2 +//! +class IPluginV2Ext : public IPluginV2 +{ +public: + //! + //! \brief Return the DataType of the plugin output at the requested index. + //! The default behavior should be to return the type of the first input, or DataType::kFLOAT if the layer has no inputs. + //! The returned data type must have a format that is supported by the plugin. + //! \see supportsFormat() + //! + virtual nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRTNOEXCEPT = 0; + + //! \brief Return true if output tensor is broadcast across a batch. + //! + //! \param outputIndex The index of the output + //! \param inputIsBroadcasted The ith element is true if the tensor for the ith input is broadcast across a batch. + //! \param nbInputs The number of inputs + //! + //! The values in inputIsBroadcasted refer to broadcasting at the semantic level, + //! i.e. are unaffected by whether method canBroadcastInputAcrossBatch requests + //! physical replication of the values. + //! + virtual bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRTNOEXCEPT = 0; + + //! \brief Return true if plugin can use input that is broadcast across batch without replication. + //! + //! 
\param inputIndex Index of input that could be broadcast. + //! + //! For each input whose tensor is semantically broadcast across a batch, + //! TensorRT calls this method before calling configurePlugin. + //! If canBroadcastInputAcrossBatch returns true, TensorRT will not replicate the input tensor; + //! i.e., there will be a single copy that the plugin should share across the batch. + //! If it returns false, TensorRT will replicate the input tensor + //! so that it appears like a non-broadcasted tensor. + //! + //! This method is called only for inputs that can be broadcast. + //! + virtual bool canBroadcastInputAcrossBatch(int inputIndex) const TRTNOEXCEPT = 0; + + //! + //! \brief Configure the layer with input and output data types. + //! + //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make algorithm choices on the basis + //! of its weights, dimensions, data types and maximum batch size. + //! + //! \param inputDims The input tensor dimensions. + //! \param nbInputs The number of inputs. + //! \param outputDims The output tensor dimensions. + //! \param nbOutputs The number of outputs. + //! \param inputTypes The data types selected for the plugin inputs. + //! \param outputTypes The data types selected for the plugin outputs. + //! \param inputIsBroadcast True for each input that the plugin must broadcast across the batch. + //! \param outputIsBroadcast True for each output that TensorRT will broadcast across the batch. + //! \param floatFormat The format selected for the engine for the floating point inputs/outputs. + //! \param maxBatchSize The maximum batch size. + //! + //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be 3-dimensional CHW dimensions). + //! When inputIsBroadcast or outputIsBroadcast is true, the outermost batch size for that input or output should be treated as if it is one. + //! \ref inputIsBroadcast[i] is true only if the input is semantically broadcast across the batch and \ref canBroadcastInputAcrossBatch(i) returned true. + //! \ref outputIsBroadcast[i] is true only if \ref isOutputBroadcastAcrossBatch(i) returned true. + //! + //! \warning for the floatFormat field, the values PluginFormat::kCHW4, PluginFormat::kCHW16, and PluginFormat::kCHW32 + //! will not be passed in, this is to keep backward compatibility with TensorRT 5.x series. Use PluginV2IOExt + //! or PluginV2DynamicExt for other PluginFormats. + //! + + virtual void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, + int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, + const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRTNOEXCEPT = 0; + + virtual ~IPluginV2Ext() {} + + //! + //! \brief Attach the plugin object to an execution context and grant the plugin the access to some context resource. + //! + //! \param cudnn The cudnn context handle of the execution context + //! \param cublas The cublas context handle of the execution context + //! \param allocator The allocator used by the execution context + //! + //! This function is called automatically for each plugin when a new execution context is created. + //! If the plugin needs per-context resource, it can be allocated here. + //! The plugin can also get context-owned CUDNN and CUBLAS context here. + //! 
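+ //! As an illustrative sketch only (not part of the original header), a plugin that wants to reuse
+ //! the context-owned cuBLAS handle can simply remember it for later use in enqueue():
+ //!
+ //!     void attachToContext(cudnnContext*, cublasContext* cublas, IGpuAllocator*) override
+ //!     {
+ //!         mCublas = cublas; // mCublas is a hypothetical member of the plugin
+ //!     }
+ //!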
+ virtual void attachToContext(cudnnContext* /*cudnn*/, cublasContext* /*cublas*/, IGpuAllocator* /*allocator*/) TRTNOEXCEPT {} + + //! + //! \brief Detach the plugin object from its execution context. + //! + //! This function is called automatically for each plugin when an execution context is destroyed. + //! If the plugin owns per-context resource, it can be released here. + //! + virtual void detachFromContext() TRTNOEXCEPT {} + + //! + //! \brief Clone the plugin object. This copies over internal plugin parameters as well and returns a new plugin object with these parameters. + //! If the source plugin is pre-configured with configurePlugin(), the returned object should also be pre-configured. The returned object should allow attachToContext() with a new execution context. + //! Cloned plugin objects can share the same per-engine immutable resource (e.g. weights) with the source object (e.g. via ref-counting) to avoid duplication. + //! + virtual IPluginV2Ext* clone() const _TENSORRT_OVERRIDE TRTNOEXCEPT = 0; + +protected: + //! + //! \brief Return the API version with which this plugin was built. The + //! upper byte is reserved by TensorRT and is used to differentiate this from IPluginV2. + //! + //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with plugins. + //! + int getTensorRTVersion() const _TENSORRT_OVERRIDE TRTNOEXCEPT + { + return (static_cast<int>(PluginVersion::kV2_EXT) << 24 | (NV_TENSORRT_VERSION & 0xFFFFFF)); + } + + //! + //! \brief Derived classes should not implement this. In a C++11 API it would be override final. + //! + void configureWithFormat(const Dims* /*inputDims*/, int /*nbInputs*/, const Dims* /*outputDims*/, + int /*nbOutputs*/, DataType /*type*/, PluginFormat /*format*/, int /*maxBatchSize*/) _TENSORRT_OVERRIDE TRTNOEXCEPT {} +}; + +//! \class IPluginV2IOExt +//! +//! \brief Plugin class for user-implemented layers. +//! +//! Plugins are a mechanism for applications to implement custom layers. This interface provides additional +//! capabilities to the IPluginV2Ext interface by extending different I/O data types and tensor formats. +//! +//! \see IPluginV2Ext +//! +class IPluginV2IOExt : public IPluginV2Ext +{ +public: + //! + //! \brief Configure the layer. + //! + //! This function is called by the builder prior to initialize(). It provides an opportunity for the layer to make + //! algorithm choices on the basis of I/O PluginTensorDesc and the maximum batch size. + //! + //! \param in The input tensors attributes that are used for configuration. + //! \param nbInput Number of input tensors. + //! \param out The output tensors attributes that are used for configuration. + //! \param nbOutput Number of output tensors. + //! + virtual void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRTNOEXCEPT = 0; + + //! + //! \brief Return true if plugin supports the format and datatype for the input/output indexed by pos. + //! + //! For this method inputs are numbered 0..(nbInputs-1) and outputs are numbered nbInputs..(nbInputs+nbOutputs-1). + //! Using this numbering, pos is an index into InOut, where 0 <= pos < nbInputs+nbOutputs. + //! + //! TensorRT invokes this method to ask if the input/output indexed by pos supports the format/datatype specified + //! by inOut[pos].format and inOut[pos].type. The override should return true if that format/datatype at inOut[pos] + //! are supported by the plugin.
If support is conditional on other input/output formats/datatypes, the plugin can + //! make its result conditional on the formats/datatypes in inOut[0..pos-1], which will be set to values + //! that the plugin supports. The override should not inspect inOut[pos+1..nbInputs+nbOutputs-1], + //! which will have invalid values. In other words, the decision for pos must be based on inOut[0..pos] only. + //! + //! Some examples: + //! + //! * A definition for a plugin that supports only FP16 NCHW: + //! + //! return inOut.format[pos] == TensorFormat::kLINEAR && inOut.type[pos] == DataType::kHALF; + //! + //! * A definition for a plugin that supports only FP16 NCHW for its two inputs, + //! and FP32 NCHW for its single output: + //! + //! return inOut.format[pos] == TensorFormat::kLINEAR && inOut.type[pos] == (pos < 2 ? DataType::kHALF : DataType::kFLOAT); + //! + //! * A definition for a "polymorphic" plugin with two inputs and one output that supports + //! any format or type, but the inputs and output must have the same format and type: + //! + //! return pos == 0 || (inOut.format[pos] == inOut.format[0] && inOut.type[pos] == inOut.type[0]); + //! + //! Warning: TensorRT will stop asking for formats once it finds kFORMAT_COMBINATION_LIMIT on combinations. + //! + virtual bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRTNOEXCEPT = 0; + +protected: + //! + //! \brief Return the API version with which this plugin was built. The upper byte is reserved by TensorRT and is + //! used to differentiate this from IPluginV2 and IPluginV2Ext. + //! + //! Do not override this method as it is used by the TensorRT library to maintain backwards-compatibility with + //! plugins. + //! + TRT_DEPRECATED + int getTensorRTVersion() const _TENSORRT_OVERRIDE + { + return (static_cast<int>(PluginVersion::kV2_IOEXT) << 24 | (NV_TENSORRT_VERSION & 0xFFFFFF)); + } + + //! + //! \brief Deprecated interface inheriting from base class. Derived classes should not implement this. In a C++11 + //! API it would be override final. + //! + TRT_DEPRECATED + void configureWithFormat( + const Dims*, int, const Dims*, int, DataType, PluginFormat, int) _TENSORRT_OVERRIDE _TENSORRT_FINAL + { + } + + //! + //! \brief Deprecated interface inheriting from base class. Derived classes should not implement this. In a C++11 + //! API it would be override final. + //! + TRT_DEPRECATED + void configurePlugin(const Dims*, int, const Dims*, int, const DataType*, const DataType*, const bool*, const bool*, + PluginFormat, int) _TENSORRT_OVERRIDE _TENSORRT_FINAL + { + } + + //! + //! \brief Deprecated interface inheriting from base class. Derived classes should not implement this. In a C++11 + //! API it would be override final. + //! + TRT_DEPRECATED + bool supportsFormat(DataType, PluginFormat) const _TENSORRT_OVERRIDE _TENSORRT_FINAL + { + return false; + } +}; + +//! +//! \enum PluginFieldType +//! \brief The possible field types for custom layers. +//! + +enum class PluginFieldType : int +{ + kFLOAT16 = 0, //!< FP16 field type. + kFLOAT32 = 1, //!< FP32 field type. + kFLOAT64 = 2, //!< FP64 field type. + kINT8 = 3, //!< INT8 field type. + kINT16 = 4, //!< INT16 field type. + kINT32 = 5, //!< INT32 field type. + kCHAR = 6, //!< char field type. + kDIMS = 7, //!< nvinfer1::Dims field type. + kUNKNOWN = 8 +}; + +//! +//! \class PluginField +//! +//! \brief Structure containing plugin attribute field names and associated data +//!
This information can be parsed to decode necessary plugin metadata +//! +//! +struct PluginField +{ + //! + //! \brief Plugin field attribute name + //! + const char* name{nullptr}; + //! + //! \brief Plugin field attribute data + //! + const void* data{nullptr}; + //! + //! \brief Plugin field attribute type + //! \see PluginFieldType + //! + PluginFieldType type{PluginFieldType::kUNKNOWN}; + //! + //! \brief Number of data entries in the Plugin attribute + //! + int32_t length{0}; + + PluginField(const char* name_ = nullptr, const void* data_ = nullptr, const PluginFieldType type_ = PluginFieldType::kUNKNOWN, int32_t length_ = 0) + : name(name_) + , data(data_) + , type(type_) + , length(length_) + { + } +}; + +struct PluginFieldCollection +{ + int nbFields; //!< Number of PluginField entries + const PluginField* fields; //!< Pointer to PluginField entries +}; + +//! +//! \class IPluginCreator +//! +//! \brief Plugin creator class for user implemented layers. +//! +//! \see IPlugin and IPluginFactory +//! + +class IPluginCreator +{ +public: + //! + //! \brief Return the version of the API the plugin creator was compiled with. + //! + virtual int getTensorRTVersion() const TRTNOEXCEPT { return NV_TENSORRT_VERSION; } + + //! + //! \brief Return the plugin name. + //! + virtual const char* getPluginName() const TRTNOEXCEPT = 0; + + //! + //! \brief Return the plugin version. + //! + virtual const char* getPluginVersion() const TRTNOEXCEPT = 0; + + //! + //! \brief Return a list of fields that needs to be passed to createPlugin. + //! \see PluginFieldCollection + //! + virtual const PluginFieldCollection* getFieldNames() TRTNOEXCEPT = 0; + + //! + //! \brief Return a plugin object. Return nullptr in case of error. + //! + virtual IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) TRTNOEXCEPT = 0; + + //! + //! \brief Called during deserialization of plugin layer. Return a plugin object. + //! + virtual IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRTNOEXCEPT = 0; + + //! + //! \brief Set the namespace of the plugin creator based on the plugin + //! library it belongs to. This can be set while registering the plugin creator. + //! + //! \see IPluginRegistry::registerCreator() + //! + virtual void setPluginNamespace(const char* pluginNamespace) TRTNOEXCEPT = 0; + + //! + //! \brief Return the namespace of the plugin creator object. + //! + virtual const char* getPluginNamespace() const TRTNOEXCEPT = 0; + + virtual ~IPluginCreator() {} +}; + +//! +//! \class IPluginRegistry +//! +//! \brief Single registration point for all plugins in an application. It is +//! used to find plugin implementations during engine deserialization. +//! Internally, the plugin registry is considered to be a singleton so all +//! plugins in an application are part of the same global registry. +//! Note that the plugin registry is only supported for plugins of type +//! IPluginV2 and should also have a corresponding IPluginCreator implementation. +//! +//! \see IPluginV2 and IPluginCreator +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! + +class IPluginRegistry +{ +public: + //! + //! \brief Register a plugin creator. Returns false if one with same type + //! is already registered. + //! + virtual bool registerCreator(IPluginCreator& creator, const char* pluginNamespace) noexcept = 0; + + //! + //! \brief Return all the registered plugin creators and the number of + //! 
registered plugin creators. Returns nullptr if none found. + //! + virtual IPluginCreator* const* getPluginCreatorList(int* numCreators) const noexcept = 0; + + //! + //! \brief Return plugin creator based on plugin type, version and + //! namespace associated with plugin during network creation. + //! + virtual IPluginCreator* getPluginCreator(const char* pluginType, const char* pluginVersion, const char* pluginNamespace = "") noexcept = 0; + +protected: + virtual ~IPluginRegistry() noexcept {} + +public: + //! + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! + virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0; + + //! + //! \brief Get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called, or an ErrorRecorder has not been + //! inherited. + //! + //! \return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual IErrorRecorder* getErrorRecorder() const noexcept = 0; +}; + + +//! +//! \enum TensorLocation +//! \brief The location for tensor data storage, device or host. +//! +enum class TensorLocation : int +{ + kDEVICE = 0, //!< Data stored on device. + kHOST = 1, //!< Data stored on host. +}; + +template <> +constexpr inline int EnumMax<TensorLocation>() +{ + return 2; +} //!< Maximum number of elements in TensorLocation enum. \see TensorLocation + +//! +//! \class IGpuAllocator +//! +//! \brief Application-implemented class for controlling allocation on the GPU. +//! +class IGpuAllocator +{ +public: + //! + //! A callback implemented by the application to handle acquisition of GPU memory. + //! + //! \param size The size of the memory required. + //! \param alignment The required alignment of memory. Alignment will be zero + //! or a power of 2 not exceeding the alignment guaranteed by cudaMalloc. + //! Thus this allocator can be safely implemented with cudaMalloc/cudaFree. + //! An alignment value of zero indicates any alignment is acceptable. + //! \param flags Reserved for future use. In the current release, 0 will be passed. + //! + //! If an allocation request of size 0 is made, nullptr should be returned. + //! + //! If an allocation request cannot be satisfied, nullptr should be returned. + //! + virtual void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) TRTNOEXCEPT = 0; + + //! + //! A callback implemented by the application to handle release of GPU memory. + //! + //! TensorRT may pass a nullptr to this function if it was previously returned by allocate(). + //! + virtual void free(void* memory) TRTNOEXCEPT = 0; + + //! + //! Destructor declared virtual as general good practice for a class with virtual methods. + //! TensorRT never calls the destructor for an IGpuAllocator defined by the application. + //! + virtual ~IGpuAllocator() {} +}; + +//! +//! \class ILogger +//! +//!
\brief Application-implemented logging interface for the builder, engine and runtime. +//! +//! Note that although a logger is passed on creation to each instance of a IBuilder or safe::IRuntime interface, the logger is internally considered a singleton, and thus +//! multiple instances of safe::IRuntime and/or IBuilder must all use the same logger. +//! +class ILogger +{ +public: + //! + //! \enum Severity + //! + //! The severity corresponding to a log message. + //! + enum class Severity : int + { + kINTERNAL_ERROR = 0, //!< An internal error has occurred. Execution is unrecoverable. + kERROR = 1, //!< An application error has occurred. + kWARNING = 2, //!< An application error has been discovered, but TensorRT has recovered or fallen back to a default. + kINFO = 3, //!< Informational messages with instructional information. + kVERBOSE = 4, //!< Verbose messages with debugging information. + }; + + //! + //! A callback implemented by the application to handle logging messages; + //! + //! \param severity The severity of the message. + //! \param msg The log message, null terminated. + //! + virtual void log(Severity severity, const char* msg) TRTNOEXCEPT = 0; + + virtual ~ILogger() {} +}; + +template <> +constexpr inline int EnumMax() +{ + return 5; +} //!< Maximum number of elements in ILogger::Severity enum. \see ILogger::Severity + +//! +//! \enum ErrorCode +//! +//! \brief Error codes that can be returned by TensorRT during execution. +//! +enum class ErrorCode : int +{ + //! + //! Execution completed successfully. + //! + kSUCCESS = 0, + + //! + //! An error that does not fall into any other category. This error is included for forward compatibility + //! + kUNSPECIFIED_ERROR = 1, + + //! + //! A non-recoverable TensorRT error occurred. + //! + kINTERNAL_ERROR = 2, + + //! + //! An argument passed to the function is invalid in isolation. + //! This is a violation of the API contract + //! + kINVALID_ARGUMENT = 3, + + //! + //! An error occurred when comparing the state of an argument relative to other arguments. For example, the + //! dimensions for concat differ between two tensors outside of the channel dimension. This error is triggered + //! when an argument is correct in isolation, but not relative to other arguments. This is to help to distinguish + //! from the simple errors from the more complex errors. + //! This is a violation of the API contract. + //! + kINVALID_CONFIG = 4, + + //! + //! An error occurred when performing an allocation of memory on the host or the device. + //! A memory allocation error is normally fatal, but in the case where the application provided its own memory + //! allocation routine, it is possible to increase the pool of available memory and resume execution. + //! + kFAILED_ALLOCATION = 5, + + //! + //! One, or more, of the components that TensorRT relies on did not initialize correctly. + //! This is a system setup issue. + //! + kFAILED_INITIALIZATION = 6, + + //! + //! An error occurred during execution that caused TensorRT to end prematurely, either an asynchronous error or + //! other execution errors reported by CUDA/DLA. In a dynamic system, the + //! data can be thrown away and the next frame can be processed or execution can be retried. + //! This is either an execution error or a memory error. + //! + kFAILED_EXECUTION = 7, + + //! + //! An error occurred during execution that caused the data to become corrupted, but execution finished. Examples + //! of this error are NaN squashing or integer overflow. 
In a dynamic system, the data can be thrown away and the + //! next frame can be processed or execution can be retried. + //! This is either a data corruption error, an input error, or a range error. + //! + kFAILED_COMPUTATION = 8, + + //! + //! TensorRT was put into a bad state by an incorrect sequence of function calls. An example of an invalid state is + //! specifying a layer to be DLA only without GPU fallback, and that layer is not supported by DLA. This can occur + //! in situations where a service is optimistically executing networks for multiple different configurations + //! without checking proper error configurations, and instead throwing away bad configurations caught by TensorRT. + //! This is a violation of the API contract, but can be recoverable. + //! + //! Example of a recovery: + //! GPU fallback is disabled and conv layer with large filter(63x63) is specified to run on DLA. This will fail due + //! to DLA not supporting the large kernel size. This can be recovered by either turning on GPU fallback + //! or setting the layer to run on the GPU. + //! + kINVALID_STATE = 9, + + //! + //! An error occurred due to the network not being supported on the device due to constraints of the hardware or + //! system. An example is running an unsafe layer in a safety certified context, or a resource requirement for the + //! current network is greater than the capabilities of the target device. The network is otherwise correct, but + //! the network and hardware combination is problematic. This can be recoverable. + //! Examples: + //! * Scratch space requests larger than available device memory and can be recovered by increasing allowed + //! workspace size. + //! * Tensor size exceeds the maximum element count and can be recovered by reducing the maximum batch size. + //! + kUNSUPPORTED_STATE = 10, + +}; + +template <> +constexpr inline int EnumMax<ErrorCode>() +{ + return 11; +} //!< Maximum number of elements in ErrorCode enum. \see ErrorCode + + +//! +//! \class IErrorRecorder +//! +//! \brief Reference counted application-implemented error reporting interface for TensorRT objects. +//! +//! The error reporting mechanism is a user defined object that interacts with the internal state of the object +//! that it is assigned to in order to determine information about abnormalities in execution. The error recorder +//! gets both an error enum that is more descriptive than pass/fail and also a string description that gives more +//! detail on the exact failure modes. In the safety context, the error strings are all limited to 128 characters +//! in length. +//! The ErrorRecorder gets passed along to any class that is created from another class that has an ErrorRecorder +//! assigned to it. For example, assigning an ErrorRecorder to an IBuilder allows all INetwork's, ILayer's, and +//! ITensor's to use the same error recorder. For functions that have their own ErrorRecorder accessor functions, +//! this allows registering a different error recorder or de-registering the error recorder for that specific +//! object. +//! +//! The ErrorRecorder object implementation must be thread safe if the same ErrorRecorder is passed to different +//! interface objects being executed in parallel in different threads. All locking and synchronization is +//! pushed to the interface implementation and TensorRT does not hold any synchronization primitives when accessing +//! the interface functions. +//! +class IErrorRecorder +{ +public: + //! + //! A typedef of a c-style string for reporting error descriptions.
+ //! + using ErrorDesc = const char*; + + //! + //! A typedef of a 32bit integer for reference counting. + //! + using RefCount = int32_t; + + virtual ~IErrorRecorder() noexcept {}; + + // Public APIs used to retrieve information from the error recorder. + + //! + //! \brief Return the number of errors + //! + //! Determines the number of errors that occurred between the current point in execution + //! and the last time that clear() was executed. Due to the possibility of asynchronous + //! errors occurring, a TensorRT API can return correct results, but still register errors + //! with the Error Recorder. The value of getNbErrors must monotonically increase until clear() + //! is called. + //! + //! \return Returns the number of errors detected, or 0 if there are no errors. + //! + //! \see clear + //! + virtual int32_t getNbErrors() const noexcept = 0; + + //! + //! \brief Returns the ErrorCode enumeration. + //! + //! \param errorIdx A 32bit integer that indexes into the error array. + //! + //! The errorIdx specifies what error code from 0 to getNbErrors()-1 that the application + //! wants to analyze and return the error code enum. + //! + //! \return Returns the enum corresponding to errorIdx. + //! + //! \see getErrorDesc, ErrorCode + //! + virtual ErrorCode getErrorCode(int32_t errorIdx) const noexcept = 0; + + //! + //! \brief Returns the c-style string description of the error. + //! + //! \param errorIdx A 32bit integer that indexes into the error array. + //! + //! For the error specified by the idx value, return the string description of the error. The + //! error string is a c-style string that is zero delimited. In the safety context there is a + //! constant length requirement to remove any dynamic memory allocations and the error message + //! may be truncated. The format of the string is "<error enum> - <error description>". + //! + //! \return Returns a string representation of the error along with a description of the error. + //! + //! \see getErrorCode + //! + virtual ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept = 0; + + //! + //! \brief Determine if the error stack has overflowed. + //! + //! In the case when the number of errors is large, this function is used to query if one or more + //! errors have been dropped due to lack of storage capacity. This is especially important in the + //! automotive safety case where the internal error handling mechanisms cannot allocate memory. + //! + //! \return true if errors have been dropped due to overflowing the error stack. + //! + virtual bool hasOverflowed() const noexcept = 0; + + //! + //! \brief Clear the error stack on the error recorder. + //! + //! Removes all the errors tracked by the error recorder. This function must guarantee that after + //! this function is called, and as long as no error occurs, the next call to getNbErrors will return + //! zero. + //! + //! \see getNbErrors + //! + virtual void clear() noexcept = 0; + + // APIs used by TensorRT to report error information to the application. + + //! + //! \brief Report an error to the error recorder with the corresponding enum and description. + //! + //! \param val The error code enum that is being reported. + //! \param desc The string description of the error. + //! + //! Report an error to the user that has a given value and human readable description. The function returns false + //! if processing can continue, which implies that the reported error is not fatal. This does not guarantee that + //! processing continues, but provides a hint to TensorRT. + //! + //!
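+ //! As an illustrative sketch only (not part of the original header), a recorder might append the
+ //! error to a bounded, mutex-protected buffer and report non-fatal errors by returning false:
+ //!
+ //!     bool reportError(ErrorCode val, ErrorDesc desc) noexcept override
+ //!     {
+ //!         std::lock_guard<std::mutex> lock(mMutex);   // mMutex, mErrors, mOverflowed, kMaxEntries are hypothetical members
+ //!         if (mErrors.size() < kMaxEntries) { mErrors.emplace_back(val, std::string(desc)); }
+ //!         else { mOverflowed = true; }
+ //!         return false;
+ //!     }
+ //!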
\return True if the error is determined to be fatal and processing of the current function must end. + //! + virtual bool reportError(ErrorCode val, ErrorDesc desc) noexcept = 0; + + //! + //! \brief Increments the refcount for the current ErrorRecorder. + //! + //! Increments the reference count for the object by one and returns the current value. + //! This reference count allows the application to know that an object inside of TensorRT has + //! taken a reference to the ErrorRecorder. If the ErrorRecorder is released before the + //! reference count hits zero, then behavior in TensorRT is undefined. It is strongly recommended + //! that the increment is an atomic operation. TensorRT guarantees that each incRefCount called on + //! an object's construction is paired with a decRefCount call when an object is destructed. + //! + //! \return The current reference counted value. + //! + virtual RefCount incRefCount() noexcept = 0; + + //! + //! \brief Decrements the refcount for the current ErrorRecorder. + //! + //! Decrements the reference count for the object by one and returns the current value. It is undefined behavior + //! to call decRefCount when RefCount is zero. If the ErrorRecorder is destroyed before the reference count + //! hits zero, then behavior in TensorRT is undefined. It is strongly recommended that the decrement is an + //! atomic operation. TensorRT guarantees that each decRefCount called when an object is destructed is + //! paired with an incRefCount call when that object was constructed. + //! + //! \return The current reference counted value. + //! + virtual RefCount decRefCount() noexcept = 0; + +}; // class IErrorRecorder + +} // namespace nvinfer1 + +extern "C" TENSORRTAPI void* createSafeInferRuntime_INTERNAL(void* logger, int version); //!< Internal C entry point for creating safe::IRuntime. + +//! +//! \brief Return the logger object. +//! +extern "C" TENSORRTAPI nvinfer1::ILogger* getLogger(); + +//! +//! \brief Return the library version number. +//! +//! The format is as for TENSORRT_VERSION: (TENSORRT_MAJOR * 1000) + (TENSORRT_MINOR * 100) + TENSORRT_PATCH. +//! +extern "C" TENSORRTAPI int getInferLibVersion(); + +//! +//! \brief Return the plugin registry +//! +extern "C" TENSORRTAPI nvinfer1::IPluginRegistry* getPluginRegistry(); + +namespace nvinfer1 +{ + +//! +//! \brief Register the plugin creator to the registry +//! The static registry object will be instantiated when the plugin library is +//! loaded. This static object will register all creators available in the +//! library to the registry. +//! +template <typename T> +class PluginRegistrar +{ +public: + PluginRegistrar() { getPluginRegistry()->registerCreator(instance, ""); } +private: + T instance{}; +}; + +#define REGISTER_TENSORRT_PLUGIN(name) \ + static nvinfer1::PluginRegistrar<name> pluginRegistrar##name {} + +} // namespace nvinfer1 + +#endif // NV_INFER_RUNTIME_COMMON_H diff --git a/include/NvInferVersion.h b/include/NvInferVersion.h new file mode 100644 index 00000000..a277d8c1 --- /dev/null +++ b/include/NvInferVersion.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NV_INFER_VERSION_H +#define NV_INFER_VERSION_H + +#define NV_TENSORRT_MAJOR 6 //!< TensorRT major version. +#define NV_TENSORRT_MINOR 0 //!< TensorRT minor version. +#define NV_TENSORRT_PATCH 1 //!< TensorRT patch version. +#define NV_TENSORRT_BUILD 0 //!< TensorRT build number. + +#define NV_TENSORRT_SONAME_MAJOR 6 //!< Shared object library major version number. +#define NV_TENSORRT_SONAME_MINOR 0 //!< Shared object library minor version number. +#define NV_TENSORRT_SONAME_PATCH 1 //!< Shared object library patch version number. + +#endif // NV_INFER_VERSION_H diff --git a/include/NvOnnxConfig.h b/include/NvOnnxConfig.h index f72b4b59..efa7cf7d 100644 --- a/include/NvOnnxConfig.h +++ b/include/NvOnnxConfig.h @@ -61,7 +61,7 @@ class IOnnxConfig //! //! \see getModelDtype() //! - virtual void setModelDtype(const nvinfer1::DataType) = 0; + virtual void setModelDtype(const nvinfer1::DataType) TRTNOEXCEPT = 0; //! //! \brief Get the Model Data Type. @@ -70,7 +70,7 @@ class IOnnxConfig //! //! \see setModelDtype() and #DataType //! - virtual nvinfer1::DataType getModelDtype() const = 0; + virtual nvinfer1::DataType getModelDtype() const TRTNOEXCEPT = 0; //! //! \brief Get the Model FileName. @@ -79,7 +79,7 @@ class IOnnxConfig //! //! \see setModelFileName() //! - virtual const char* getModelFileName() const = 0; + virtual const char* getModelFileName() const TRTNOEXCEPT = 0; //! //! \brief Set the Model File Name. @@ -92,7 +92,7 @@ class IOnnxConfig //! //! \see getModelFileName() //! - virtual void setModelFileName(const char* onnxFilename) = 0; + virtual void setModelFileName(const char* onnxFilename) TRTNOEXCEPT = 0; //! //! \brief Get the Verbosity Level. @@ -101,7 +101,7 @@ class IOnnxConfig //! //! \see addVerbosity(), reduceVerbosity() //! - virtual Verbosity getVerbosityLevel() const = 0; + virtual Verbosity getVerbosityLevel() const TRTNOEXCEPT = 0; //! //! \brief Increase the Verbosity Level. @@ -110,9 +110,9 @@ class IOnnxConfig //! //! \see addVerbosity(), reduceVerbosity(), setVerbosity(Verbosity) //! - virtual void addVerbosity() = 0; //!< Increase verbosity Level. - virtual void reduceVerbosity() = 0; //!< Decrease verbosity Level. - virtual void setVerbosityLevel(Verbosity) = 0; //!< Set to specific verbosity Level. + virtual void addVerbosity() TRTNOEXCEPT = 0; //!< Increase verbosity Level. + virtual void reduceVerbosity() TRTNOEXCEPT = 0; //!< Decrease verbosity Level. + virtual void setVerbosityLevel(Verbosity) TRTNOEXCEPT = 0; //!< Set to specific verbosity Level. //! //! \brief Returns the File Name of the Network Description as a Text File. @@ -121,7 +121,7 @@ class IOnnxConfig //! //! \see setTextFilename() //! - virtual const char* getTextFileName() const = 0; + virtual const char* getTextFileName() const TRTNOEXCEPT = 0; //! //! \brief Set the File Name of the Network Description as a Text File. @@ -134,7 +134,7 @@ class IOnnxConfig //! //! \see getTextFilename() //! - virtual void setTextFileName(const char* textFileName) = 0; + virtual void setTextFileName(const char* textFileName) TRTNOEXCEPT = 0; //! //! 
\brief Get the File Name of the Network Description as a Text File, including the weights. @@ -143,7 +143,7 @@ class IOnnxConfig //! //! \see setFullTextFilename() //! - virtual const char* getFullTextFileName() const = 0; + virtual const char* getFullTextFileName() const TRTNOEXCEPT = 0; //! //! \brief Set the File Name of the Network Description as a Text File, including the weights. @@ -156,7 +156,7 @@ class IOnnxConfig //! //! \see getFullTextFilename() //! - virtual void setFullTextFileName(const char* fullTextFileName) = 0; + virtual void setFullTextFileName(const char* fullTextFileName) TRTNOEXCEPT = 0; //! //! \brief Get whether the layer information will be printed. @@ -165,19 +165,19 @@ class IOnnxConfig //! //! \see setPrintLayerInfo() //! - virtual bool getPrintLayerInfo() const = 0; + virtual bool getPrintLayerInfo() const TRTNOEXCEPT = 0; //! //! \brief Set whether the layer information will be printed. //! //! \see getPrintLayerInfo() //! - virtual void setPrintLayerInfo(bool) = 0; + virtual void setPrintLayerInfo(bool) TRTNOEXCEPT = 0; //! //! \brief Destroy IOnnxConfig object. //! - virtual void destroy() = 0; + virtual void destroy() TRTNOEXCEPT = 0; }; // class IOnnxConfig @@ -185,4 +185,4 @@ TENSORRTAPI IOnnxConfig* createONNXConfig(); } // namespace nvonnxparser -#endif +#endif // NV_ONNX_CONFIG_H diff --git a/include/NvOnnxParser.h b/include/NvOnnxParser.h index 207e96a5..e6fc63b6 100644 --- a/include/NvOnnxParser.h +++ b/include/NvOnnxParser.h @@ -26,6 +26,11 @@ static const int NV_ONNX_PARSER_VERSION = ((NV_ONNX_PARSER_MAJOR * 10000) + (NV_ONNX_PARSER_MINOR * 100) + NV_ONNX_PARSER_PATCH); class onnxTensorDescriptorV1; +//! +//! \namespace nvonnxparser +//! +//! \brief The TensorRT ONNX parser API namespace +//! namespace nvonnxparser { diff --git a/include/NvUffParser.h b/include/NvUffParser.h index ba5d7dfd..db528f23 100644 --- a/include/NvUffParser.h +++ b/include/NvUffParser.h @@ -19,11 +19,16 @@ #include "NvInfer.h" -//Current supported Universal Framework Format (UFF) version for the parser. +// Current supported Universal Framework Format (UFF) version for the parser. #define UFF_REQUIRED_VERSION_MAJOR 0 #define UFF_REQUIRED_VERSION_MINOR 6 -#define UFF_REQUIRED_VERSION_PATCH 3 +#define UFF_REQUIRED_VERSION_PATCH 5 +//! +//! \namespace nvuffparser +//! +//! \brief The TensorRT UFF parser API namespace. +//! namespace nvuffparser { @@ -70,7 +75,7 @@ class TENSORRTAPI FieldMap FieldType type = FieldType::kUNKNOWN; int length = 1; - FieldMap(const char* name, const void* data, const FieldType type, int length = 1); + FieldMap(const char* name, const void* data, const FieldType type, int length = 1) TRTNOEXCEPT; }; struct FieldCollection @@ -92,7 +97,7 @@ class IPluginFactory //! //! \param layerName Name of the layer which the user wishes to validate. //! - virtual bool isPlugin(const char* layerName) = 0; + virtual bool isPlugin(const char* layerName) TRTNOEXCEPT = 0; //! //! \brief Creates a plugin. @@ -105,7 +110,7 @@ class IPluginFactory //! \see FieldCollection //! virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights, - const FieldCollection fc) = 0; + const FieldCollection fc) TRTNOEXCEPT = 0; }; @@ -117,7 +122,7 @@ class IPluginFactory class IPluginFactoryExt : public IPluginFactory { public: - virtual int getVersion() const + virtual int getVersion() const TRTNOEXCEPT { return NV_TENSORRT_VERSION; } @@ -127,7 +132,7 @@ class IPluginFactoryExt : public IPluginFactory //! //! 
\param layerName Name of the layer which the user wishes to validate. //! - virtual bool isPluginExt(const char* layerName) = 0; + virtual bool isPluginExt(const char* layerName) TRTNOEXCEPT = 0; }; //! @@ -147,14 +152,14 @@ class IUffParser //! \param inputDims Input dimensions. //! \param inputOrder Input order on which the framework input was originally. //! - virtual bool registerInput(const char* inputName, nvinfer1::Dims inputDims, UffInputOrder inputOrder) = 0; + virtual bool registerInput(const char* inputName, nvinfer1::Dims inputDims, UffInputOrder inputOrder) TRTNOEXCEPT = 0; //! //! \brief Register an output name of a UFF network. //! //! \param outputName Output name. //! - virtual bool registerOutput(const char* outputName) = 0; + virtual bool registerOutput(const char* outputName) TRTNOEXCEPT = 0; //! //! \brief Parse a UFF file. @@ -165,7 +170,7 @@ class IUffParser //! virtual bool parse(const char* file, nvinfer1::INetworkDefinition& network, - nvinfer1::DataType weightsType=nvinfer1::DataType::kFLOAT) = 0; + nvinfer1::DataType weightsType=nvinfer1::DataType::kFLOAT) TRTNOEXCEPT = 0; //! //! \brief Parse a UFF buffer, useful if the file already live in memory. @@ -177,46 +182,73 @@ class IUffParser //! virtual bool parseBuffer(const char* buffer, std::size_t size, nvinfer1::INetworkDefinition& network, - nvinfer1::DataType weightsType=nvinfer1::DataType::kFLOAT) = 0; + nvinfer1::DataType weightsType=nvinfer1::DataType::kFLOAT) TRTNOEXCEPT = 0; - virtual void destroy() = 0; + virtual void destroy() TRTNOEXCEPT = 0; //! //! \brief Return Version Major of the UFF. //! - virtual int getUffRequiredVersionMajor() = 0; + virtual int getUffRequiredVersionMajor() TRTNOEXCEPT = 0; //! //! \brief Return Version Minor of the UFF. //! - virtual int getUffRequiredVersionMinor() = 0; + virtual int getUffRequiredVersionMinor() TRTNOEXCEPT = 0; //! //! \brief Return Patch Version of the UFF. //! - virtual int getUffRequiredVersionPatch() = 0; + virtual int getUffRequiredVersionPatch() TRTNOEXCEPT = 0; //! //! \brief Set the IPluginFactory used to create the user defined plugins. //! //! \param factory Pointer to an instance of the user implmentation of IPluginFactory. //! - virtual void setPluginFactory(IPluginFactory* factory) = 0; + virtual void setPluginFactory(IPluginFactory* factory) TRTNOEXCEPT = 0; //! //! \brief Set the IPluginFactoryExt used to create the user defined pluginExts. //! //! \param factory Pointer to an instance of the user implmentation of IPluginFactoryExt. //! - virtual void setPluginFactoryExt(IPluginFactoryExt* factory) = 0; + virtual void setPluginFactoryExt(IPluginFactoryExt* factory) TRTNOEXCEPT = 0; //! //! \brief Set the namespace used to lookup and create plugins in the network. //! - virtual void setPluginNamespace(const char* libNamespace) = 0; + virtual void setPluginNamespace(const char* libNamespace) TRTNOEXCEPT = 0; protected: virtual ~IUffParser() {} + +public: + //! + //! \brief Set the ErrorRecorder for this interface + //! + //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution. + //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting + //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if + //! a recorder has been registered. + //! + //! \param recorder The error recorder to register with this interface. + // + //! \see getErrorRecorder + //! 
+ virtual void setErrorRecorder(nvinfer1::IErrorRecorder* recorder) TRTNOEXCEPT = 0; + + //! + //! \brief get the ErrorRecorder assigned to this interface. + //! + //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist, + //! so a nullptr will be returned if setErrorRecorder has not been called. + //! + //! \return A pointer to the IErrorRecorder object that has been registered. + //! + //! \see setErrorRecorder + //! + virtual nvinfer1::IErrorRecorder* getErrorRecorder() const TRTNOEXCEPT = 0; }; //! @@ -226,17 +258,17 @@ class IUffParser //! //! \see nvuffparser::IUffParser //! -TENSORRTAPI IUffParser* createUffParser(); +TENSORRTAPI IUffParser* createUffParser() TRTNOEXCEPT; //! //! \brief Shuts down protocol buffers library. //! //! \note No part of the protocol buffers library can be used after this function is called. //! -TENSORRTAPI void shutdownProtobufLibrary(void); +TENSORRTAPI void shutdownProtobufLibrary(void) TRTNOEXCEPT; } -extern "C" TENSORRTAPI void* createNvUffParser_INTERNAL(); +extern "C" TENSORRTAPI void* createNvUffParser_INTERNAL() TRTNOEXCEPT; -#endif /* !NV_UFF_PARSER_H */ +#endif // NV_UFF_PARSER_H diff --git a/parsers/caffe/CMakeLists.txt b/parsers/caffe/CMakeLists.txt index 2e69465a..14a46b94 100644 --- a/parsers/caffe/CMakeLists.txt +++ b/parsers/caffe/CMakeLists.txt @@ -62,6 +62,7 @@ set_target_properties(${SHARED_TARGET} target_link_libraries(${SHARED_TARGET} ${Protobuf_LIBRARY} + nvinfer ) # modify google namespace to avoid namespace collision. diff --git a/parsers/caffe/CaffeParserSources.txt b/parsers/caffe/CaffeParserSources.txt index 9ef83162..bcfaf44c 100644 --- a/parsers/caffe/CaffeParserSources.txt +++ b/parsers/caffe/CaffeParserSources.txt @@ -32,6 +32,7 @@ set(CAFFE_PARSER_SRCS caffeParser/opParsers/parsePermute.cpp caffeParser/opParsers/parsePooling.cpp caffeParser/opParsers/parsePower.cpp + caffeParser/opParsers/parsePReLU.cpp caffeParser/opParsers/parseReduction.cpp caffeParser/opParsers/parseReLU.cpp caffeParser/opParsers/parseReshape.cpp diff --git a/parsers/caffe/caffeParser/caffeParser.cpp b/parsers/caffe/caffeParser/caffeParser.cpp index 079dbe39..511bbf21 100644 --- a/parsers/caffe/caffeParser/caffeParser.cpp +++ b/parsers/caffe/caffeParser/caffeParser.cpp @@ -24,7 +24,7 @@ #include "binaryProtoBlob.h" #include "google/protobuf/text_format.h" #include "half.h" -#include "NvInferPlugin.h" +#include "NvInferPluginUtils.h" using namespace nvinfer1; using namespace nvcaffeparser1; @@ -386,14 +386,31 @@ const IBlobNameToTensor* CaffeParser::parse(INetworkDefinition& network, for (int i = 0; i < mDeploy->input_size(); i++) { - DimsCHW dims; - if (mDeploy->input_shape_size()) + Dims dims; + if (network.hasImplicitBatchDimension()) { - dims = DimsCHW{(int) mDeploy->input_shape().Get(i).dim().Get(1), (int) mDeploy->input_shape().Get(i).dim().Get(2), (int) mDeploy->input_shape().Get(i).dim().Get(3)}; + if (mDeploy->input_shape_size()) + { + dims = DimsCHW{(int) mDeploy->input_shape().Get(i).dim().Get(1), (int) mDeploy->input_shape().Get(i).dim().Get(2), (int) mDeploy->input_shape().Get(i).dim().Get(3)}; + } + else + { + // Deprecated, but still used in a lot of networks + dims = DimsCHW{(int) mDeploy->input_dim().Get(i * 4 + 1), (int) mDeploy->input_dim().Get(i * 4 + 2), (int) mDeploy->input_dim().Get(i * 4 + 3)}; + } } else - { // deprecated, but still used in a lot of networks - dims = DimsCHW{(int) mDeploy->input_dim().Get(i * 4 + 1), (int) mDeploy->input_dim().Get(i * 4 + 2), (int) 
mDeploy->input_dim().Get(i * 4 + 3)}; + { + std::cout << "Warning, setting batch size to 1. Update the dimension after parsing due to using explicit batch size." << std::endl; + if (mDeploy->input_shape_size()) + { + dims = DimsNCHW{1, (int) mDeploy->input_shape().Get(i).dim().Get(1), (int) mDeploy->input_shape().Get(i).dim().Get(2), (int) mDeploy->input_shape().Get(i).dim().Get(3)}; + } + else + { + // Deprecated, but still used in a lot of networks + dims = DimsNCHW{1, (int) mDeploy->input_dim().Get(i * 4 + 1), (int) mDeploy->input_dim().Get(i * 4 + 2), (int) mDeploy->input_dim().Get(i * 4 + 3)}; + } } ITensor* tensor = network.addInput(mDeploy->input().Get(i).c_str(), DataType::kFLOAT, dims); (*mBlobNameToTensor)[mDeploy->input().Get(i)] = tensor; @@ -576,8 +593,19 @@ const IBlobNameToTensor* CaffeParser::parse(INetworkDefinition& network, } else { - DimsCHW dims{(int) shape.dim().Get(1), (int) shape.dim().Get(2), (int) shape.dim().Get(3)}; - ITensor* tensor = network.addInput(layerMsg.top(i).c_str(), DataType::kFLOAT, dims); + Dims d; + if (network.hasImplicitBatchDimension()) + { + d = DimsCHW{(int) shape.dim().Get(1), (int) shape.dim().Get(2), (int) shape.dim().Get(3)}; + } + else + { + std::cout << "Warning, setting batch size to 1. Update the dimension after parsing due to " + "using explicit batch size." + << std::endl; + d = DimsNCHW{1, (int) shape.dim().Get(1), (int) shape.dim().Get(2), (int) shape.dim().Get(3)}; + } + ITensor* tensor = network.addInput(layerMsg.top(i).c_str(), DataType::kFLOAT, d); (*mBlobNameToTensor)[layerMsg.top().Get(i)] = tensor; } } diff --git a/parsers/caffe/caffeParser/caffeParser.h b/parsers/caffe/caffeParser/caffeParser.h index a01ea2f5..b639d9a1 100644 --- a/parsers/caffe/caffeParser/caffeParser.h +++ b/parsers/caffe/caffeParser/caffeParser.h @@ -56,6 +56,8 @@ class CaffeParser : public ICaffeParser void setPluginNamespace(const char* libNamespace) override { mPluginNamespace = libNamespace; } IBinaryProtoBlob* parseBinaryProto(const char* fileName) override; void destroy() override { delete this; } + void setErrorRecorder(nvinfer1::IErrorRecorder* recorder) override { (void)recorder; assert(!"TRT- Not implemented."); } + nvinfer1::IErrorRecorder* getErrorRecorder() const override { assert(!"TRT- Not implemented."); return nullptr; } private: ~CaffeParser() override; diff --git a/parsers/caffe/caffeParser/opParsers/opParsers.h b/parsers/caffe/caffeParser/opParsers/opParsers.h index b5bc48f0..22ab66c1 100644 --- a/parsers/caffe/caffeParser/opParsers/opParsers.h +++ b/parsers/caffe/caffeParser/opParsers/opParsers.h @@ -62,6 +62,7 @@ nvinfer1::ILayer* parseLRN(nvinfer1::INetworkDefinition& network, const trtcaffe nvinfer1::ILayer* parsePermute(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& /*weightFactory*/, BlobNameToTensor& tensors); nvinfer1::ILayer* parsePooling(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& /*weightFactory*/, BlobNameToTensor& tensors); nvinfer1::ILayer* parsePower(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& weightFactory, BlobNameToTensor& tensors); +nvinfer1::ILayer* parsePReLU(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& weightFactory, BlobNameToTensor& tensors); nvinfer1::ILayer* parseReduction(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& weightFactory, BlobNameToTensor& tensors); 
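// Illustrative aside, not part of this patch: each Caffe op parser declared here is a free function with
// the common signature shown above; new ops such as the PReLU support added by this change are wired in
// by appending a name-to-function entry to the gParseTable map declared further down, e.g.
//
//     {"PReLU", parsePReLU}
//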
nvinfer1::ILayer* parseReLU(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& /*weightFactory*/, BlobNameToTensor& tensors); nvinfer1::ILayer* parseReshape(nvinfer1::INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& /*weightFactory*/, BlobNameToTensor& tensors); @@ -94,7 +95,8 @@ static std::unordered_map gParseTable {"ELU", parseELU}, {"BNLL", parseBNLL}, {"Clip", parseClip}, - {"AbsVal", parseAbsVal} + {"AbsVal", parseAbsVal}, + {"PReLU", parsePReLU} }; } // namespace nvcaffeparser1 #endif //TRT_CAFFE_PARSER_OP_PARSERS_H diff --git a/parsers/caffe/caffeParser/opParsers/parseConcat.cpp b/parsers/caffe/caffeParser/opParsers/parseConcat.cpp index 97297a8a..565e2695 100644 --- a/parsers/caffe/caffeParser/opParsers/parseConcat.cpp +++ b/parsers/caffe/caffeParser/opParsers/parseConcat.cpp @@ -24,9 +24,15 @@ ILayer* parseConcat(INetworkDefinition& network, const trtcaffe::LayerParameter& const trtcaffe::ConcatParameter& p = msg.concat_param(); bool hasAxis = p.has_axis(); // optional parameter - if (hasAxis && p.axis() <= 0) + if (hasAxis && p.axis() < 0) { - std::cout << "Caffe parser: Concat along batch axis or negative axis is not supported." << std::endl; + std::cout << "Caffe parser: Concat negative axis is not supported." << std::endl; + return nullptr; + } + if (network.hasImplicitBatchDimension() && p.axis() == 0) + { + std::cout << "Caffe parser: Concat across batch axis with implicit batch dimensions is not supported." + << std::endl; return nullptr; } @@ -42,9 +48,9 @@ ILayer* parseConcat(INetworkDefinition& network, const trtcaffe::LayerParameter& // Rely on the default axis setting inside TRT which takes into account NPCHW and higher dimensional input. if (hasAxis) { - concat->setAxis(p.axis() - 1); + concat->setAxis(p.axis() - static_cast(network.hasImplicitBatchDimension())); } return concat; } -} //namespace nvcaffeparser1 \ No newline at end of file +} //namespace nvcaffeparser1 diff --git a/parsers/caffe/caffeParser/opParsers/parseConv.cpp b/parsers/caffe/caffeParser/opParsers/parseConv.cpp index ec76e975..036c9028 100644 --- a/parsers/caffe/caffeParser/opParsers/parseConv.cpp +++ b/parsers/caffe/caffeParser/opParsers/parseConv.cpp @@ -41,7 +41,8 @@ ILayer* parseConvolution(INetworkDefinition& network, const trtcaffe::LayerParam weightFactory.convert(kernelWeights); weightFactory.convert(biasWeights); - auto layer = network.addConvolution(*tensors[msg.bottom(0)], nbOutputs, DimsHW{kernelH, kernelW}, kernelWeights, biasWeights); + auto inTensor = tensors[msg.bottom(0)]; + auto layer = network.addConvolution(*inTensor, nbOutputs, DimsHW{kernelH, kernelW}, kernelWeights, biasWeights); if (layer) { diff --git a/parsers/caffe/caffeParser/opParsers/parsePReLU.cpp b/parsers/caffe/caffeParser/opParsers/parsePReLU.cpp new file mode 100644 index 00000000..0c9c3c21 --- /dev/null +++ b/parsers/caffe/caffeParser/opParsers/parsePReLU.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "opParsers.h" + +using namespace nvinfer1; + +namespace nvcaffeparser1 +{ +ILayer* parsePReLU(INetworkDefinition& network, const trtcaffe::LayerParameter& msg, CaffeWeightFactory& weightFactory, + BlobNameToTensor& tensors) +{ + // Caffe stores the slopes as weights rather than as a tensor, and only supports different slopes + // per channel + if (!checkBlobs(msg, 1, 1)) + { + return nullptr; + } + + const trtcaffe::PReLUParameter& p = msg.prelu_param(); + bool channelShared = p.has_channel_shared() ? p.channel_shared() : false; + auto inputDims = tensors[msg.bottom(0)]->getDimensions(); + if (inputDims.nbDims < 2) + { + return nullptr; + } + int nWeights = channelShared ? 1 : inputDims.d[1]; // Caffe treats second input dimension as channels + Dims slopesDims{inputDims.nbDims, {1}, {DimensionType::kSPATIAL}}; + slopesDims.d[1] = nWeights; + + Weights w = weightFactory.isInitialized() ? weightFactory(msg.name(), WeightType::kGENERIC) : + weightFactory.allocateWeights(nWeights, std::uniform_real_distribution<float>(0.F, 1.F)); + auto constLayer = network.addConstant(slopesDims, w); + return network.addParametricReLU(*tensors[msg.bottom(0)], *constLayer->getOutput(0)); +} +} //namespace nvcaffeparser1 \ No newline at end of file diff --git a/parsers/caffe/caffeParser/opParsers/parseReduction.cpp b/parsers/caffe/caffeParser/opParsers/parseReduction.cpp index 7f6f39b4..8358cb9a 100644 --- a/parsers/caffe/caffeParser/opParsers/parseReduction.cpp +++ b/parsers/caffe/caffeParser/opParsers/parseReduction.cpp @@ -57,31 +57,54 @@ ILayer* parseReduction(INetworkDefinition& network, const trtcaffe::LayerParamet axis = (axis < 0) ? 4 + axis : axis; // axis negative number correction float coeff = hasCoeff ? p.coeff() : 1.0; // default is 1 - // acceptable axis values: 1, 2, 3, -1, -2, -3 - // unacceptable axis values: 0 and anything else - // acceptable corrected axis values: 1, 2, 3 - // unacceptable corrected axis values: 0 and anything else + // With implicit batch dimensions: + // acceptable axis values: 1, 2, 3, -1, -2, -3 + // unacceptable axis values: 0 and anything else + // acceptable corrected axis values: 1, 2, 3 + // unacceptable corrected axis values: 0 and anything else + // + // With explicit batch dimensions: + // acceptable axis values: 1, 2, 3, 0, -1, -2, -3 + // unacceptable axis values: anything else + // acceptable corrected axis values: 0, 1, 2, 3 + // unacceptable corrected axis values: anything else + // + // protect against "garbage" input arguments - bool axisAbort = (axis != 1 && axis != 2 && axis != 3); - - if (axisAbort) + if (axis < 0 || axis > 3) + { + std::cout << "Caffe Parser: Invalid axis in reduction layer - can only reduce NCHW input." << std::endl; + return nullptr; + } + if (network.hasImplicitBatchDimension() && axis == 0) { - std::cout << "Caffe Parser: Invalid axis in reduction layer - cannot reduce over batch size dimension and can only reduce NCHW input" << std::endl; + std::cout << "Caffe Parser: Invalid axis in reduction layer - cannot reduce over batch size dimension." + << std::endl; return nullptr; } ReduceOperation op = (operation == MEAN ? 
ReduceOperation::kAVG : ReduceOperation::kSUM); - // corrected axis values are 1, 2, 3 + // For implicit batch, corrected axis values are 1, 2, 3 // only reduction along tail dimensions is supported // 1 means 111 or 4 + 2 + 1 = 7 // 2 means 110 or 4 + 2 = 6 // 3 means 100 or 4 // Let's employ a bit shift trick instead - // 1000 = 8 - // axis == 1: 1u << (axis - 1) is 1 and so 8 - 1 = 7 or 111 - // axis == 2: 1u << (axis - 1) is 2 and so 8 - 2 = 6 or 110 - // axis == 3: 1u << (axis - 1) is 4 and so 8 - 4 = 4 or 100 - uint32_t reduceAxes = 8 - (1u << (axis - 1)); + // 10000 = 16 + // axis == 1: 1u << axis is 2 and so (16 - 2) >> 1 = 7 or 111 + // axis == 2: 1u << axis is 4 and so (16 - 4) >> 1 = 6 or 110 + // axis == 3: 1u << axis is 8 and so (16 - 8) >> 1 = 4 or 100 + // + // For explicit batch, corrected axis values are 0, 1, 2, 3 + // 0 means 1111 or 8 + 4 + 2 + 1 = 15 + // 1 means 1110 or 8 + 4 + 2 = 14 + // 2 means 1100 or 8 + 4 = 12 + // 3 means 1000 or 8 + // 10000 = 16 + // axis == 0: 1u << axis is 1 and so 16 - 1 = 15 or 1111 + // axis == 1: 1u << axis is 2 and so 16 - 2 = 14 or 1110 + // axis == 2: 1u << axis is 4 and so 16 - 4 = 12 or 1100 + // axis == 3: 1u << axis is 8 and so 16 - 8 = 8 or 1000 + uint32_t reduceAxes = (16 - (1u << axis)) >> static_cast<int>(network.hasImplicitBatchDimension()); ITensor* input = tensors[msg.bottom(0)]; ILayer* returnVal = nullptr; diff --git a/parsers/caffe/caffeParser/opParsers/parseReshape.cpp b/parsers/caffe/caffeParser/opParsers/parseReshape.cpp index 56bff35f..ed29399f 100644 --- a/parsers/caffe/caffeParser/opParsers/parseReshape.cpp +++ b/parsers/caffe/caffeParser/opParsers/parseReshape.cpp @@ -32,15 +32,18 @@ ILayer* parseReshape(INetworkDefinition& network, const trtcaffe::LayerParameter const ::trtcaffe::BlobShape& shape = p.shape(); // Check that N (batch dim) is 0. TensorRT does not support reshape in batch dimension - if ((axis == 0) && (shape.dim(0) != 0)) + if (network.hasImplicitBatchDimension() && (axis == 0) && (shape.dim(0) != 0)) { - std::cout << "Caffe Parser: Invalid reshape param. TensorRT does not support reshape in N (batch) dimension" << std::endl; + std::cout << "Caffe Parser: Invalid reshape param. TensorRT does not support reshape in N (batch) dimension" + << std::endl; return nullptr; } // Handle axis and dims parameters int axStart = std::max(0, axis - 1); - int axEnd = p.has_num_axes() ? std::max(0, axis - 1 + p.num_axes()) : bottomDims.nbDims; + int axEnd = p.has_num_axes() + ? 
std::max(0, axis - static_cast(network.hasImplicitBatchDimension()) + p.num_axes()) + : bottomDims.nbDims; std::vector reshapeDims; reshapeDims.reserve(axStart); @@ -52,7 +55,7 @@ ILayer* parseReshape(INetworkDefinition& network, const trtcaffe::LayerParameter for (int i = 0; i < shape.dim_size(); i++) { // skip first 0 (batch) - if (axis == 0 && i == 0) + if (network.hasImplicitBatchDimension() && axis == 0 && i == 0) { continue; } @@ -103,4 +106,4 @@ ILayer* parseReshape(INetworkDefinition& network, const trtcaffe::LayerParameter layer->setReshapeDimensions(topDims); return layer; } -} //namespace nvcaffeparser1 \ No newline at end of file +} //namespace nvcaffeparser1 diff --git a/parsers/caffe/caffeParser/opParsers/parseSoftMax.cpp b/parsers/caffe/caffeParser/opParsers/parseSoftMax.cpp index 069a9b3d..13953c8a 100644 --- a/parsers/caffe/caffeParser/opParsers/parseSoftMax.cpp +++ b/parsers/caffe/caffeParser/opParsers/parseSoftMax.cpp @@ -39,11 +39,19 @@ ILayer* parseSoftMax(INetworkDefinition& network, const trtcaffe::LayerParameter bool hasAxis = p.has_axis(); // optional parameter int axis = hasAxis ? p.axis() : 1; // default is 1 - bool axisAbort = (axis <= 0) || (axis > 3) || (axis > nbDims); + if (network.hasImplicitBatchDimension() && axis == 0) + { + std::cout << "Caffe Parser: Invalid axis in softmax layer - TensorRT does not support softmax across the batch " + "axis with implicit batch dimensions networks." + << std::endl; + return nullptr; + } - if (axisAbort) + if (axis < 0 || axis > 3 || (axis > nbDims)) { - std::cout << "Caffe Parser: Invalid axis in softmax layer - Cannot perform softmax along batch size dimension and expects NCHW input. Negative axis is not supported in TensorRT, please use positive axis indexing" << std::endl; + std::cout << "Caffe Parser: Invalid axis in softmax layer - TensorRT expects NCHW input. 
Negative axis is not " + "supported in TensorRT, please use positive axis indexing" + << std::endl; return nullptr; } @@ -54,9 +62,9 @@ ILayer* parseSoftMax(INetworkDefinition& network, const trtcaffe::LayerParameter // NPCHW -> default axis when setAxes is not called will be 2 (the C dimension) if (hasAxis) { - uint32_t axes = 1u << (axis - 1); + uint32_t axes = 1u << (axis - static_cast(network.hasImplicitBatchDimension())); softmax->setAxes(axes); } return softmax; } -} //namespace nvcaffeparser1 \ No newline at end of file +} //namespace nvcaffeparser1 diff --git a/parsers/onnx b/parsers/onnx index 8b52755b..400df73d 160000 --- a/parsers/onnx +++ b/parsers/onnx @@ -1 +1 @@ -Subproject commit 8b52755beb781366f3bd2a34b3a0be0fcd4c0c33 +Subproject commit 400df73d8fb1f1b447bad4df4a6f237c75f165e5 diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 71ded7c1..83792207 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -44,6 +44,12 @@ set(PLUGIN_LISTS cropAndResizePlugin proposalPlugin batchTilePlugin + detectionLayerPlugin + proposalLayerPlugin + pyramidROIAlignPlugin + resizeNearestPlugin + specialSlicePlugin + instanceNormalizationPlugin ) include_directories(common common/kernels) @@ -81,7 +87,7 @@ set_target_properties(${SHARED_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${TRT_BIN_DIR}" ) -set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL -Wl,--version-script=${PLUGIN_EXPORT_MAP}") +set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL -Wl,--version-script=${PLUGIN_EXPORT_MAP} -Wl,--no-undefined") set_target_properties(${SHARED_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_POSTFIX}) @@ -92,6 +98,7 @@ set_property(TARGET ${SHARED_TARGET} PROPERTY CUDA_STANDARD 11) target_link_libraries(${SHARED_TARGET} ${CUBLAS_LIB} ${CUDART_LIB} + ${CUDNN_LIB} nvinfer ) diff --git a/plugin/InferPlugin.cpp b/plugin/InferPlugin.cpp index 301f063c..329b2547 100644 --- a/plugin/InferPlugin.cpp +++ b/plugin/InferPlugin.cpp @@ -23,7 +23,6 @@ using namespace nvinfer1; using namespace nvinfer1::plugin; #include "batchedNMSPlugin/batchedNMSPlugin.h" -#include "batchTilePlugin/batchTilePlugin.h" #include "cropAndResizePlugin/cropAndResizePlugin.h" #include "flattenConcat/flattenConcat.h" #include "gridAnchorPlugin/gridAnchorPlugin.h" @@ -35,6 +34,14 @@ using namespace nvinfer1::plugin; #include "regionPlugin/regionPlugin.h" #include "reorgPlugin/reorgPlugin.h" +#include "batchTilePlugin/batchTilePlugin.h" +#include "detectionLayerPlugin/detectionLayerPlugin.h" +#include "proposalLayerPlugin/proposalLayerPlugin.h" +#include "pyramidROIAlignPlugin/pyramidROIAlignPlugin.h" +#include "resizeNearestPlugin/resizeNearestPlugin.h" +#include "specialSlicePlugin/specialSlicePlugin.h" +#include "instanceNormalizationPlugin/instanceNormalizationPlugin.h" + using nvinfer1::plugin::RPROIParams; namespace nvinfer1 @@ -42,7 +49,7 @@ namespace nvinfer1 namespace plugin { -ILogger* gLogger {}; +ILogger* gLogger{}; // Instances of this class are statically constructed in initializePlugin. 
// This ensures that each plugin is only registered a single time, as further calls to @@ -105,6 +112,12 @@ bool initLibNvInferPlugins(void* logger, const char* libNamespace) initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); return true; } } // extern "C" diff --git a/plugin/batchedNMSPlugin/README.md b/plugin/batchedNMSPlugin/README.md index 59e34761..94f879e1 100644 --- a/plugin/batchedNMSPlugin/README.md +++ b/plugin/batchedNMSPlugin/README.md @@ -110,4 +110,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. \ No newline at end of file +When running `cub::DeviceSegmentedRadixSort::SortPairsDescending` with `cuda-memcheck --tool racecheck`, it will not work correctly. diff --git a/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp b/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp index 018d5941..febcd59d 100644 --- a/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp +++ b/plugin/batchedNMSPlugin/batchedNMSPlugin.cpp @@ -76,7 +76,7 @@ Dims BatchedNMSPlugin::getOutputDimensions(int index, const Dims* inputs, int nb // num_detections if (index == 0) { - Dims dim0; + Dims dim0{}; dim0.nbDims = 0; return dim0; } @@ -86,7 +86,7 @@ Dims BatchedNMSPlugin::getOutputDimensions(int index, const Dims* inputs, int nb return DimsHW(param.keepTopK, 4); } // nmsed_scores or nmsed_classes - Dims dim1; + Dims dim1{}; dim1.nbDims = 1; dim1.d[0] = param.keepTopK; return dim1; diff --git a/plugin/batchedNMSPlugin/batchedNMSPlugin.h b/plugin/batchedNMSPlugin/batchedNMSPlugin.h index 7423f94b..4e63d8c6 100644 --- a/plugin/batchedNMSPlugin/batchedNMSPlugin.h +++ b/plugin/batchedNMSPlugin/batchedNMSPlugin.h @@ -82,10 +82,12 @@ class BatchedNMSPlugin : public IPluginV2Ext void setClipParam(bool clip); private: - NMSParameters param; - int boxesSize, scoresSize, numPriors; + NMSParameters param{}; + int boxesSize{}; + int scoresSize{}; + int numPriors{}; std::string mNamespace; - bool mClipBoxes; + bool mClipBoxes{}; const char* mPluginNamespace; }; diff --git a/plugin/common/cudaDriverWrapper.cu b/plugin/common/cudaDriverWrapper.cu new file mode 100644 index 00000000..eec1ad2a --- /dev/null +++ b/plugin/common/cudaDriverWrapper.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef __linux__ +#ifdef __x86_64__ +#include +#include +#include +#include "cudaDriverWrapper.h" +#include + +using namespace nvinfer1; + +CUDADriverWrapper::CUDADriverWrapper() +{ + handle = dlopen("libcuda.so.1", RTLD_LAZY); + assert(handle != nullptr); + + auto load_sym = [](void *handle, const char *name) { + void *ret = dlsym(handle, name); + assert(ret != nullptr); + return ret; + }; + + *(void**)(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName"); + *(void**)(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute"); + *(void**)(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete"); + *(void**)(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload"); + *(void**)(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy"); + *(void**)(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData"); + *(void**)(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2"); + *(void**)(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction"); + *(void**)(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2"); + *(void**)(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2"); + *(void**)(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel"); +} + +CUDADriverWrapper::~CUDADriverWrapper() +{ + dlclose(handle); +} + +CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, const char** pStr) const +{ + return (*_cuGetErrorName)(error, pStr); +} + +CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const +{ + return (*_cuFuncSetAttribute)(hfunc, attrib, value); +} + +CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const +{ + return (*_cuLinkComplete)(state, cubinOut, sizeOut); +} + +CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const +{ + return (*_cuModuleUnload)(hmod); +} + +CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const +{ + return (*_cuLinkDestroy)(state); +} + +CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, const void* image) const +{ + return (*_cuModuleLoadData)(module, image); +} + +CUresult CUDADriverWrapper::cuLinkCreate(unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const +{ + return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut); +} + +CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const +{ + return (*_cuModuleGetFunction)(hfunc, hmod, name); +} + +CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, CUjit_option* options, void** optionValues) const +{ + return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues); +} + +CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void** optionValues) const +{ + return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues); +} + +CUresult CUDADriverWrapper::cuLaunchCooperativeKernel (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const +{ + return (*_cuLaunchCooperativeKernel)(f, gridDimX, gridDimY, gridDimZ, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams); +} + +#endif // __x86_64__ +#endif //__linux__ diff 
--git a/plugin/common/cudaDriverWrapper.h b/plugin/common/cudaDriverWrapper.h new file mode 100644 index 00000000..da74420e --- /dev/null +++ b/plugin/common/cudaDriverWrapper.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUDA_DRIVER_WRAPPER_H +#define CUDA_DRIVER_WRAPPER_H + +#ifdef __linux__ +#ifdef __x86_64__ +#include +#include +#include + +namespace nvinfer1 +{ +class CUDADriverWrapper +{ +public: + CUDADriverWrapper(); + + ~CUDADriverWrapper(); + + CUresult cuGetErrorName(CUresult error, const char** pStr) const; + + CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const; + + CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const; + + CUresult cuModuleUnload(CUmodule hmod) const; + + CUresult cuLinkDestroy(CUlinkState state) const; + + CUresult cuModuleLoadData(CUmodule* module, const void* image) const; + + CUresult cuLinkCreate( + unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const; + + CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const; + + CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, + CUjit_option* options, void** optionValues) const; + + CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, + unsigned int numOptions, CUjit_option* options, void** optionValues) const; + + CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) const; + +private: + void* handle; + CUresult (*_cuGetErrorName)(CUresult, const char**); + CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int); + CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*); + CUresult (*_cuModuleUnload)(CUmodule); + CUresult (*_cuLinkDestroy)(CUlinkState); + CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*); + CUresult (*_cuModuleLoadData)(CUmodule*, const void*); + CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, const char*); + CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, const char*, unsigned int, CUjit_option*, void**); + CUresult (*_cuLinkAddData)( + CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**); + CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, + unsigned int, unsigned int, unsigned int, CUstream, void**); +}; +} // namespace nvinfer1 +#endif // __x86_64__ +#endif //__linux__ +#endif // CUDA_DRIVER_WRAPPER_H diff --git a/plugin/common/kernels/allClassNMS.cu b/plugin/common/kernels/allClassNMS.cu index 7eed4156..8dca8693 100644 --- a/plugin/common/kernels/allClassNMS.cu +++ 
b/plugin/common/kernels/allClassNMS.cu @@ -139,6 +139,8 @@ __global__ void allClassNMS_kernel( Bbox loc_bbox[TSIZE]; // initialize Bbox, Bboxinfo, kept_bboxinfo_flag + // Eliminate shared memory RAW hazard + __syncthreads(); #pragma unroll for (int t = 0; t < TSIZE; t++) { @@ -186,6 +188,9 @@ __global__ void allClassNMS_kernel( ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2]; ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3]; + // Eliminate shared memory RAW hazard + __syncthreads(); + for (int t = 0; t < TSIZE; t++) { const int cur_idx = threadIdx.x + blockDim.x * t; diff --git a/plugin/common/kernels/maskRCNNKernels.cu b/plugin/common/kernels/maskRCNNKernels.cu new file mode 100644 index 00000000..1f831ea4 --- /dev/null +++ b/plugin/common/kernels/maskRCNNKernels.cu @@ -0,0 +1,1537 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "maskRCNNKernels.h" +#include "plugin.h" +#include +#include +#include +#include +#include +#include +#include + +#define DUBUG_KERNEL 0 +#define DUBUG_BATCH 0 +#define DEBUG_T 1 + +#define dMIN(a, b) ((a) < (b) ? (a) : (b)) +#define dMAX(a, b) ((a) > (b) ? (a) : (b)) +#define dCLAMP(x, xMin, xMax) ((x) > (xMin) ? ((x) < (xMax) ? (x) : (xMax)) : (xMin)) + +template +struct BBoxT +{ + BoxType y1, x1, y2, x2; +}; + +template +__global__ void argMaxReset_kernel( + int samples, int NClass, const DType* in_scores, const int* maxIdx, DType* out_scores) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int max_idx = samples * NClass; + if (idx >= max_idx) + return; + + int sampleIdx = idx / NClass; + int classIdx = idx % NClass; + if (classIdx != maxIdx[sampleIdx]) + out_scores[idx] = 0; + else + out_scores[idx] = in_scores[idx]; +} + +template +struct ScanItem +{ + DType data; + int idx; +}; + +template +struct GreaterItem +{ + __host__ __device__ __forceinline__ ScanItem operator()( + const ScanItem& a, const ScanItem& b) const + { + return (a.data > b.data ? 
a : b); + } +}; + +template +__global__ void resetMemValue_kernel(void* outPtr, int samples, float val) +{ + DType* out = static_cast(outPtr); + int loop = gridDim.x * blockDim.x; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < samples; idx += loop) + { + out[idx] = (DType) val; + } +} + +// blockDim.x : NClass +// GroupDim.x : sample count +// GroupDim.y : batch N +// outScore : DType[ N * sample * 1 ] +// outLabel : int[ N * sample * 1 ] +// outBbox : int[ N * sample * 4 ] +template +__global__ void argMaxGroup_kernel(int samples, int NClass, const void* inScorePtr, const void* inBboxPtr, + const void* validSampleCountPtr, void* outScorePtr, void* outLabelPtr, void* outBboxPtr) +{ + const DType* inScore = static_cast(inScorePtr); + const BoxType* inBbox = static_cast(inBboxPtr); + const int* validSampleCount = static_cast(validSampleCountPtr); + DType* outScore = static_cast(outScorePtr); + BoxType* outLabel = static_cast(outLabelPtr); + BoxType* outBbox = static_cast(outBboxPtr); + + const int N = blockIdx.y; + const int validSamples = validSampleCount[N]; + + typedef ScanItem ScanItemD; + typedef cub::BlockReduce BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int iSample = blockIdx.x; iSample < validSamples; iSample += gridDim.x) + { + int classOffset = (N * samples + iSample) * NClass; // start from [batch, count, class0] + // total IPerThread * blockDim + ScanItemD maxItem = {0.0f, -1}; + for (int i = 0; i < NClass; i += Threads) + { + int curIdx = i + threadIdx.x; + ScanItemD item = {0.0f, -1}; + if (curIdx < NClass) + { + item.data = inScore[classOffset + curIdx]; + item.idx = curIdx; + } + const int validNum = (NClass - i > Threads ? Threads : NClass - i); + ScanItemD aggregate = BlockReduce(temp_storage).Reduce(item, GreaterItem(), validNum); + __syncthreads(); + if (aggregate.data > maxItem.data) + { + maxItem = aggregate; + } +#if DUBUG_KERNEL + if (N == DUBUG_BATCH && threadIdx.x == 0 && iSample < 15 /*&& maxItem.idx >= 32*/) + { + printf("argMaxGroup N:%d, iSample:%d, maxItem(score:%.3f, idx:%d)validReduceNum:%d\n", N, iSample, + (float) maxItem.data, maxItem.idx, validNum); + } +#endif + } + + const int dstOffset = N * samples + iSample; + if (threadIdx.x == 0) + { + outScore[dstOffset] = maxItem.data; + outLabel[dstOffset] = (BoxType) maxItem.idx; + outBbox[dstOffset * 4] = inBbox[(classOffset + maxItem.idx) * 4]; + outBbox[dstOffset * 4 + 1] = inBbox[(classOffset + maxItem.idx) * 4 + 1]; + outBbox[dstOffset * 4 + 2] = inBbox[(classOffset + maxItem.idx) * 4 + 2]; + outBbox[dstOffset * 4 + 3] = inBbox[(classOffset + maxItem.idx) * 4 + 3]; + } + } +} + +struct BlockClassSumPrefix +{ + int total; + // Constructor + __device__ BlockClassSumPrefix() + : total(0) + { + } + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. + __device__ int operator()(int aggregate) + { + int old = total; + total += aggregate; + return old; + } +}; + +#define LabelShift (DType)(2.5f) +#define MinValidScore (DType)(0.01f) + +template +__device__ __forceinline__ DType getKey(DType score, int lable, int NClass) +{ + return (lable < 0 ? (DType) 0 : ((DType)(NClass - lable - 1) * LabelShift + score + MinValidScore)); +} + +template +__device__ __forceinline__ void getScoreLable(DType key, int NClass, DType& score, BoxType& lable) +{ + int i = key / LabelShift; + score = (key <= MinValidScore ? 
(DType) 0 : key - (DType) i * LabelShift - MinValidScore); + score = dCLAMP(score, (DType) 0, (DType) 1.0); + lable = (BoxType)(key <= MinValidScore ? -1 : (NClass - i - 1)); +} + +// blockDim.x : threads +// gridDim.x : batch N +// validSampleCount INPUT : int [N] +// classStartPos OUTPUT: int [N * (Class + 1)], need memset to zero before this kernel +// outScore OUTPUT : DType [N * samples] +// outLabel OUTPUT : int [N * samples] +// outSampleIdx OUTPUT : int [N * samples] +// outValidSampleCount : int [N] +// IPerThread * Threads >= sample-count +#define MaxClassNum 255 +template +__global__ void sortPerClass_kernel( + // int N, + int samples, int NClass, int background, float scoreThreshold, const void* validSampleCountPtr, + const void* inScorePtr, const void* inLabelPtr, const void* inBboxPtr, void* classStartPosPtr, void* outScorePtr, + void* outLabelPtr, void* outSampleIdxPtr, void* outValidSampleCountPtr) +{ + typedef cub::BlockExchange BlockExchangeKey; + typedef cub::BlockExchange BlockExchangeI; + typedef cub::BlockRadixSort BlockRadixSort; + typedef cub::BlockScan BlockScanClass; + __shared__ union + { + typename BlockExchangeKey::TempStorage storageKey; + typename BlockExchangeI::TempStorage storageI; + typename BlockRadixSort::TempStorage storageSort; + typename BlockScanClass::TempStorage storageScan; + } temp_storage; + __shared__ int smemClassCount[MaxClassNum]; + assert(NClass < MaxClassNum); + assert(IPerThread * Threads >= samples); + + const int* validSampleCount = static_cast(validSampleCountPtr); + const DType* inScore = static_cast(inScorePtr); + const BoxType* inLabel = static_cast(inLabelPtr); + int* classStartPos = static_cast(classStartPosPtr); + DType* outScore = static_cast(outScorePtr); + BoxType* outLabel = static_cast(outLabelPtr); + int* outSampleIdx = static_cast(outSampleIdxPtr); + int* outValidSampleCount = static_cast(outValidSampleCountPtr); + + for (int s = threadIdx.x; s < NClass + 1; s += blockDim.x) + { + smemClassCount[s] = 0; + } + + int N = blockIdx.x; + int blockOffset = N * samples; + int validSamples = validSampleCount[N]; + DType key[IPerThread]; + int iSample[IPerThread]; + for (int i = 0; i < IPerThread; ++i) + { + iSample[i] = -1; + key[i] = -1.0f; + int curIdx = i * Threads + threadIdx.x; + if (curIdx < validSamples) + { + int label = (int) (inLabel[blockOffset + curIdx]); + DType score = inScore[blockOffset + curIdx]; + if (label != background && label != -1 && score >= (DType) scoreThreshold) + { + key[i] = getKey(score, label, NClass); + iSample[i] = curIdx; + } + } + } + + BlockExchangeKey(temp_storage.storageKey).StripedToBlocked(key); + __syncthreads(); + BlockExchangeI(temp_storage.storageI).StripedToBlocked(iSample); + __syncthreads(); + BlockRadixSort(temp_storage.storageSort).SortDescendingBlockedToStriped(key, iSample); + __syncthreads(); + + // store Idx + cub::StoreDirectStriped(threadIdx.x, outSampleIdx + blockOffset, iSample, validSamples); + BoxType lable[IPerThread]; + DType score[IPerThread]; + +#pragma unroll + for (int i = 0; i < IPerThread; ++i) + { + getScoreLable(key[i], NClass, score[i], lable[i]); + } + cub::StoreDirectStriped(threadIdx.x, outScore + blockOffset, score, validSamples); + cub::StoreDirectStriped(threadIdx.x, outLabel + blockOffset, lable, validSamples); + + // final + for (int i = 0; i < IPerThread; ++i) + { + if (lable[i] >= (BoxType) 0) + { + atomicAdd(&smemClassCount[(int) lable[i]], 1); + } + } + __syncthreads(); + + int classBlockOffset = N * (NClass + 1); // Exclusive-sum, 1st is 0, last is 
final sum + +#if DUBUG_KERNEL + if (N == DUBUG_BATCH && threadIdx.x == 0) + { + printf("sortPerClass(N:%d) final count of each label, valid samples:%d\n", N, validSamples); + for (int k = 0; k < NClass; ++k) + { + if (smemClassCount[k] > 0) + printf("Batch:%d, L:%d, count:%d, \n", N, k, smemClassCount[k]); + } + } + __syncthreads(); +#endif + + BlockClassSumPrefix sumPrefix; + for (int s = 0; s < NClass; s += blockDim.x) + { // s start from block + int iClassSamples = 0; + int iClass = s + threadIdx.x; + if (iClass < NClass) + { + iClassSamples = smemClassCount[iClass]; + } + BlockScanClass(temp_storage.storageScan).ExclusiveSum(iClassSamples, iClassSamples, sumPrefix); + __syncthreads(); + if (iClass < NClass) + { + classStartPos[classBlockOffset + iClass] = iClassSamples; + } + } + if (threadIdx.x == 0) + { + classStartPos[classBlockOffset + NClass] = sumPrefix.total; + assert(sumPrefix.total <= validSamples); // background data removed. + outValidSampleCount[N] = sumPrefix.total; +#if DUBUG_KERNEL + if (N == DUBUG_BATCH) + printf("After sortPerClass, batch:%d valid samples total:%d\n", N, sumPrefix.total); +#endif + } +} + +template +__device__ __forceinline__ BBoxT readBbox(const BBoxT* inBbox, int idx) +{ + BBoxT ret = ((BBoxT*) (inBbox))[idx]; + return ret; +} + +template +__device__ __forceinline__ DType boxIoU(const BBoxT& a, const BBoxT& b) +{ + BBoxT overlap = { + dMAX(a.y1, b.y1), dMAX(a.x1, b.x1), dMIN(a.y2, b.y2), dMIN(a.x2, b.x2), + }; + DType oW = overlap.x2 - overlap.x1; + DType oH = overlap.y2 - overlap.y1; + if (oW < (DType) 0 || oH < (DType) 0) + return (DType) 0; + DType oA = oW * oH; + return (oA / ((a.y2 - a.y1) * (a.x2 - a.x1) + (b.y2 - b.y1) * (b.x2 - b.x1) - oA)); +} + +// PerClassNMS +// gridDim.x : batch-N +// blockDim.x : Threads +// ItemsPerThreads : = divUp(samples, Threads) +// outFlagSamples OUT: int [N * samples] +template +__global__ void PerClassNMS_kernel( + // int N, + int samples, int NClass, const float nmsThreshold, const void* validSampleCountPtr, + // const void *inScorePtr, + const void* inLabelPtr, const void* inBboxPtr, const void* inBboxRefIdxPtr, const void* classStartsPtr, + void* outFlagSamplesPtr) +{ + typedef BBoxT BBox; + __shared__ struct + { + BBox refBox[MaxClassNum]; + int endIdx[MaxClassNum]; + int refIdx[MaxClassNum + 1]; + bool markSamples[Threads * ItemsPerThreads]; + int done; + } smemClasses; + assert(NClass + 1 < MaxClassNum); + assert(samples <= Threads * ItemsPerThreads); + + const int* validSampleCount = static_cast(validSampleCountPtr); + // const DType *inScore = static_cast(inScorePtr); + const BoxType* inLabel = static_cast(inLabelPtr); + const BBox* inBbox = static_cast(inBboxPtr); + const int* inBboxRefIdx = static_cast(inBboxRefIdxPtr); + const int* classStarts = static_cast(classStartsPtr); + int* outFlagSamples = static_cast(outFlagSamplesPtr); + + int N = blockIdx.x; + int blockOffset = N * samples; + int validSamples = validSampleCount[N]; + + if (threadIdx.x == 0) + { + smemClasses.done = 0; + } + + BBox curBox[ItemsPerThreads]; + int label[ItemsPerThreads]; +#pragma unroll + for (int ite = 0; ite * blockDim.x < validSamples; ++ite) + { + int curIdx = ite * blockDim.x + threadIdx.x; + if (curIdx < validSamples) + { + label[ite] = (int) inLabel[blockOffset + curIdx]; + curBox[ite] = readBbox(inBbox, blockOffset + inBboxRefIdx[blockOffset + curIdx]); + } + else + { + label[ite] = -1; + } + smemClasses.markSamples[curIdx] = (label[ite] < 0 ? 
false : true); + } + + int classBlockOffset = N * (NClass + 1); + for (int i = threadIdx.x; i < NClass + 1; i += blockDim.x) + { + int refIdx = classStarts[classBlockOffset + i]; + smemClasses.refIdx[i] = refIdx; + smemClasses.refBox[i] = readBbox(inBbox, blockOffset + inBboxRefIdx[blockOffset + refIdx]); + } + __syncthreads(); + + for (int i = threadIdx.x; i < NClass; i += blockDim.x) + { + int endIdx = smemClasses.refIdx[i + 1]; + smemClasses.endIdx[i] = endIdx; + if (endIdx == smemClasses.refIdx[i]) + { + atomicAdd(&smemClasses.done, 1); + } + } + __syncthreads(); + +#if DUBUG_KERNEL + // print info + if (N == DUBUG_BATCH && threadIdx.x == 0) + { + printf("batch:%d, before starting NMS, done count:%d\n", N, smemClasses.done); + printf("batch:%d, Total num:%d, startPos:\n", N, validSamples); + for (int k = 0; k < NClass; ++k) + { + if (smemClasses.refIdx[k] != smemClasses.endIdx[k]) + { + printf("Batch:%d, label:%d [%d : %d], check ref-label:%d\n", N, k, smemClasses.refIdx[k], + smemClasses.endIdx[k], (int) inLabel[blockOffset + smemClasses.refIdx[k]]); + } + } + printf("\n"); + } + __syncthreads(); +#endif + + // class done to check stop point + while (smemClasses.done < NClass) + { + + for (int ite = 0; ite * blockDim.x < validSamples; ++ite) + { + int curIdx = ite * blockDim.x + threadIdx.x; + int refIdx = -1; + int endIdx = -1; + if (curIdx < validSamples && smemClasses.markSamples[curIdx]) + { + if (label[ite] >= 0) + { + refIdx = smemClasses.refIdx[label[ite]]; + endIdx = smemClasses.endIdx[label[ite]]; + if (curIdx > refIdx && curIdx < endIdx) + { + BBox refBox = smemClasses.refBox[label[ite]]; + if (boxIoU(refBox, curBox[ite]) >= (DType) nmsThreshold) + { + smemClasses.markSamples[curIdx] = false; + } + } + } + } + } + __syncthreads(); + + // push refIdx/refBox forward to next mark + // only the refIdx thread to push itself. other threads idle + for (int i = threadIdx.x; i < NClass; i += blockDim.x) + { + int refIdx = smemClasses.refIdx[i]; + int endIdx = smemClasses.endIdx[i]; + if (refIdx < endIdx) + { + do + { + ++refIdx; + } while (refIdx < endIdx && smemClasses.markSamples[refIdx] == false); + smemClasses.refIdx[i] = refIdx; + if (refIdx < endIdx) + { + smemClasses.refBox[i] = readBbox(inBbox, blockOffset + inBboxRefIdx[blockOffset + refIdx]); + } + else + { + atomicAdd(&smemClasses.done, 1); + } + } + } + __syncthreads(); + } + + // no need to write all data out + for (int segment = 0; segment < validSamples; segment += blockDim.x) + { + int curIdx = segment + threadIdx.x; + if (curIdx < validSamples) + { + outFlagSamples[blockOffset + curIdx] = (smemClasses.markSamples[curIdx] ? 
1 : 0); + } + } +} + +// TopKGather +// gridDim.x : batch-N +// blockDim.x : Threads +// ItemsPerThreads : = divUp(samples, Threads) +// outDetectionCount : int [N], must be set 0 before kernel +#define MaxItemsPerThreads 8 +template +__global__ void TopKGatherProposal_kernel(int samples, int keepTopK, const void* validSampleCountPtr, + const void* inScorePtr, const void* inLabelPtr, const void* inBboxPtr, const void* inBboxRefIdxPtr, + const void* inFlagSamplesPtr, void* outBboxPtr) +{ + typedef BBoxT BBox; + typedef cub::BlockRadixSort BlockRadixSort1; + typedef cub::BlockRadixSort BlockRadixSort2; + typedef cub::BlockRadixSort BlockRadixSort3; + typedef cub::BlockRadixSort BlockRadixSort4; + typedef cub::BlockRadixSort BlockRadixSort5; + typedef cub::BlockRadixSort BlockRadixSort6; + typedef cub::BlockRadixSort BlockRadixSort7; + typedef cub::BlockRadixSort BlockRadixSort8; + __shared__ union + { + typename BlockRadixSort8::TempStorage sort8; + typename BlockRadixSort7::TempStorage sort7; + typename BlockRadixSort6::TempStorage sort6; + typename BlockRadixSort5::TempStorage sort5; + typename BlockRadixSort4::TempStorage sort4; + typename BlockRadixSort3::TempStorage sort3; + typename BlockRadixSort2::TempStorage sort2; + typename BlockRadixSort1::TempStorage sort1; + } temp_storage; + assert(MaxItemsPerThreads * Threads >= samples); + + const int* validSampleCount = static_cast(validSampleCountPtr); + const DType* inScore = static_cast(inScorePtr); + const BBox* inBbox = static_cast(inBboxPtr); + const int* inBboxRefIdx = static_cast(inBboxRefIdxPtr); + const int* inFlagSamples = static_cast(inFlagSamplesPtr); + BBox* outBbox = static_cast(outBboxPtr); + + int N = blockIdx.x; + int blockOffset = N * samples; + int validSamples = validSampleCount[N]; + int finalTopK = dMIN(keepTopK, validSamples); + + int idx[MaxItemsPerThreads]; + DType score[MaxItemsPerThreads]; + int totalItems = (validSamples + (blockDim.x - 1)) / blockDim.x; + + for (int ite = 0; ite < totalItems; ++ite) + { + int curIdx = ite * blockDim.x + threadIdx.x; + if (curIdx < validSamples && inFlagSamples[blockOffset + curIdx]) + { + idx[ite] = curIdx; + score[ite] = inScore[blockOffset + curIdx]; + } + else + { + idx[ite] = -1; + score[ite] = 0.0f; + } + } + + switch (totalItems) + { + case 0: break; + case 1: + BlockRadixSort1(temp_storage.sort1).SortDescendingBlockedToStriped((DType(&)[1]) score, (int(&)[1]) idx); + break; + case 2: + BlockRadixSort2(temp_storage.sort2).SortDescendingBlockedToStriped((DType(&)[2]) score, (int(&)[2]) idx); + break; + case 3: + BlockRadixSort3(temp_storage.sort3).SortDescendingBlockedToStriped((DType(&)[3]) score, (int(&)[3]) idx); + break; + case 4: + BlockRadixSort4(temp_storage.sort4).SortDescendingBlockedToStriped((DType(&)[4]) score, (int(&)[4]) idx); + break; + case 5: + BlockRadixSort5(temp_storage.sort5).SortDescendingBlockedToStriped((DType(&)[5]) score, (int(&)[5]) idx); + break; + case 6: + BlockRadixSort6(temp_storage.sort6).SortDescendingBlockedToStriped((DType(&)[6]) score, (int(&)[6]) idx); + break; + case 7: + BlockRadixSort7(temp_storage.sort7).SortDescendingBlockedToStriped((DType(&)[7]) score, (int(&)[7]) idx); + break; + case 8: + BlockRadixSort8(temp_storage.sort8).SortDescendingBlockedToStriped((DType(&)[8]) score, (int(&)[8]) idx); + break; + default: assert(false); + } + __syncthreads(); + + int outBlockOffset = N * keepTopK; + int topkItems = (keepTopK + (Threads - 1)) / Threads; + for (int i = 0; i < topkItems; ++i) + { + int curI = i * blockDim.x + threadIdx.x; 
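+            // Each thread writes a strided subset of the keepTopK output slots. A slot receives the
+            // NMS-surviving box it maps to; slots beyond finalTopK, or whose sorted score does not
+            // exceed MinValidScore, are padded with an all-zero box so every image still emits
+            // exactly keepTopK proposals.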
+ if (curI < keepTopK) + { + BBox oB = {(BoxType) 0.0f, (BoxType) 0.0f, (BoxType) 0.0f, (BoxType) 0.0f}; + if (curI < finalTopK && idx[i] >= 0 && score[i] > MinValidScore) + { + oB = ((BBox*) inBbox)[blockOffset + inBboxRefIdx[blockOffset + idx[i]]]; + } + ((BBox*) outBbox)[outBlockOffset + curI] = oB; + } + } +} + +#define MaxItemsPerThreads 8 +template +__global__ void TopKGather_kernel(int samples, int keepTopK, const void* validSampleCountPtr, const void* inScorePtr, + const void* inLabelPtr, const void* inBboxPtr, const void* inBboxRefIdxPtr, const void* inFlagSamplesPtr, + void* outDetectionPtr) +{ + typedef BBoxT BBox; + typedef cub::BlockRadixSort BlockRadixSort1; + typedef cub::BlockRadixSort BlockRadixSort2; + typedef cub::BlockRadixSort BlockRadixSort3; + typedef cub::BlockRadixSort BlockRadixSort4; + typedef cub::BlockRadixSort BlockRadixSort5; + typedef cub::BlockRadixSort BlockRadixSort6; + typedef cub::BlockRadixSort BlockRadixSort7; + typedef cub::BlockRadixSort BlockRadixSort8; + __shared__ union + { + typename BlockRadixSort8::TempStorage sort8; + typename BlockRadixSort7::TempStorage sort7; + typename BlockRadixSort6::TempStorage sort6; + typename BlockRadixSort5::TempStorage sort5; + typename BlockRadixSort4::TempStorage sort4; + typename BlockRadixSort3::TempStorage sort3; + typename BlockRadixSort2::TempStorage sort2; + typename BlockRadixSort1::TempStorage sort1; + } temp_storage; + assert(MaxItemsPerThreads * Threads >= samples); + + const int* validSampleCount = static_cast(validSampleCountPtr); + const DType* inScore = static_cast(inScorePtr); + const BoxType* inLabel = static_cast(inLabelPtr); // InLabel keeps INT32 + const BBox* inBbox = static_cast(inBboxPtr); + const int* inBboxRefIdx = static_cast(inBboxRefIdxPtr); + const int* inFlagSamples = static_cast(inFlagSamplesPtr); + DType* outDetections = static_cast(outDetectionPtr); + + int N = blockIdx.x; + int blockOffset = N * samples; + int validSamples = validSampleCount[N]; + int finalTopK = dMIN(keepTopK, validSamples); + + int idx[MaxItemsPerThreads]; + DType score[MaxItemsPerThreads]; + int totalItems = (validSamples + (blockDim.x - 1)) / blockDim.x; + + for (int ite = 0; ite < totalItems; ++ite) + { + int curIdx = ite * blockDim.x + threadIdx.x; + if (curIdx < validSamples && inFlagSamples[blockOffset + curIdx]) + { + idx[ite] = curIdx; + score[ite] = inScore[blockOffset + curIdx]; + } + else + { + idx[ite] = -1; + score[ite] = 0.0f; + } + } + + switch (totalItems) + { + case 0: break; + case 1: + BlockRadixSort1(temp_storage.sort1).SortDescendingBlockedToStriped((DType(&)[1]) score, (int(&)[1]) idx); + break; + case 2: + BlockRadixSort2(temp_storage.sort2).SortDescendingBlockedToStriped((DType(&)[2]) score, (int(&)[2]) idx); + break; + case 3: + BlockRadixSort3(temp_storage.sort3).SortDescendingBlockedToStriped((DType(&)[3]) score, (int(&)[3]) idx); + break; + case 4: + BlockRadixSort4(temp_storage.sort4).SortDescendingBlockedToStriped((DType(&)[4]) score, (int(&)[4]) idx); + break; + case 5: + BlockRadixSort5(temp_storage.sort5).SortDescendingBlockedToStriped((DType(&)[5]) score, (int(&)[5]) idx); + break; + case 6: + BlockRadixSort6(temp_storage.sort6).SortDescendingBlockedToStriped((DType(&)[6]) score, (int(&)[6]) idx); + break; + case 7: + BlockRadixSort7(temp_storage.sort7).SortDescendingBlockedToStriped((DType(&)[7]) score, (int(&)[7]) idx); + break; + case 8: + BlockRadixSort8(temp_storage.sort8).SortDescendingBlockedToStriped((DType(&)[8]) score, (int(&)[8]) idx); + break; + default: 
assert(false); + } + __syncthreads(); + + int outBlockOffset = N * keepTopK; + int topkItems = (keepTopK + (Threads - 1)) / Threads; + for (int i = 0; i < topkItems; ++i) + { + int curI = i * blockDim.x + threadIdx.x; + if (curI < keepTopK) + { + BBox oB = {(BoxType) 0.0f, (BoxType) 0.0f, (BoxType) 0.0f, (BoxType) 0.0f}; + DType oS = 0.0f; + BoxType oL = -1; + if (curI < finalTopK && idx[i] >= 0 && score[i] > MinValidScore) + { + oB = ((BBox*) inBbox)[blockOffset + inBboxRefIdx[blockOffset + idx[i]]]; + oS = score[i]; + oL = (BoxType) inLabel[blockOffset + idx[i]]; + } + outDetections[(outBlockOffset + curI) * 6] = oB.y1; + outDetections[(outBlockOffset + curI) * 6 + 1] = oB.x1; + outDetections[(outBlockOffset + curI) * 6 + 2] = oB.y2; + outDetections[(outBlockOffset + curI) * 6 + 3] = oB.x2; + outDetections[(outBlockOffset + curI) * 6 + 4] = oL; + outDetections[(outBlockOffset + curI) * 6 + 5] = oS; + } + } +} + +RefineDetectionWorkSpace::RefineDetectionWorkSpace( + const int batchSize, const int sampleCount, const RefineNMSParameters& param, const nvinfer1::DataType inType) + : argMaxScoreDims(sampleCount, 1) + , argMaxBboxDims(sampleCount, 4) + , argMaxLabelDims(sampleCount, 1) + , sortClassScoreDims(sampleCount, 1) + , sortClassLabelDims(sampleCount, 1) + , sortClassSampleIdxDims(sampleCount, 1) + , sortClassPosDims(param.numClasses + 1, 1) + , sortNMSMarkDims(sampleCount, 1) +{ + size_t sumSize = 0; + + const nvinfer1::DataType type = nvinfer1::DataType::kFLOAT; + + // resource + // arMaxScore : [N, samples] : m_Type + argMaxScoreOffset = sumSize; + sumSize += AlignMem(dimVolume(argMaxScoreDims) * typeSize(type) * batchSize); + + argMaxBboxOffset = sumSize; + // argMaxBbox : [N, samples, 4] : m_Type + sumSize += AlignMem(dimVolume(argMaxBboxDims) * typeSize(type) * batchSize); + + argMaxLabelOffset = sumSize; + // argMaxLabel : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(argMaxLabelDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortClassScoreOffset = sumSize; + // sortClassScore : [N, samples] : m_Type + sumSize += AlignMem(dimVolume(sortClassScoreDims) * typeSize(type) * batchSize); + + sortClassLabelOffset = sumSize; + // sortClassLabel : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(sortClassLabelDims) * typeSize(type) * batchSize); + + sortClassSampleIdxOffset = sumSize; + // sortClassSampleIdx : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(sortClassSampleIdxDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortClassValidCountOffset = sumSize; + // sortClassValidCount : [N, 1] : kINT32 + sumSize += AlignMem(dimVolume(sortClassValidCountDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortClassPosOffset = sumSize; + // sortClassPos : [N, numClasses+1] : kINT32 + sumSize += AlignMem(dimVolume(sortClassPosDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortNMSMarkOffset = sumSize; + // sortNMSMark : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(sortNMSMarkDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + totalSize = sumSize; +} + +ProposalWorkSpace::ProposalWorkSpace(const int batchSize, const int inputCnt, const int sampleCount, + const RefineNMSParameters& param, const nvinfer1::DataType inType) + : preRefineScoreDims(inputCnt, 1) + , preRefineSortedScoreDims(inputCnt, 1) + , preRefineBboxDims(inputCnt, 4) + , argMaxScoreDims(sampleCount, 1) + , argMaxBboxDims(sampleCount, 4) + , argMaxLabelDims(sampleCount, 1) + , sortClassScoreDims(sampleCount, 1) + , 
sortClassLabelDims(sampleCount, 1) + , sortClassSampleIdxDims(sampleCount, 1) + , sortClassPosDims(param.numClasses + 1, 1) + , sortNMSMarkDims(sampleCount, 1) +{ + size_t sumSize = 0; + + const nvinfer1::DataType type = nvinfer1::DataType::kFLOAT; + + // resource + // temp storage size for sorting scores + tempStorageOffset = sumSize; + sumSize += (1 << 23) * batchSize; + + // preRefineScore : [N, inputcnt, 1] // extracted foreground score from inputs[0] + preRefineScoreOffset = sumSize; + sumSize += AlignMem(dimVolume(preRefineScoreDims) * typeSize(type) * batchSize); + + // preRefineSortedScore: [N, inputcnt, 1] + preRefineSortedScoreOffset = sumSize; + sumSize += AlignMem(dimVolume(preRefineSortedScoreDims) * typeSize(type) * batchSize); + + // preRefineBbox: [N, inputcnt, 4] // sorted bbox + preRefineBboxOffset = sumSize; + sumSize += AlignMem(dimVolume(preRefineBboxDims) * typeSize(type) * batchSize); + + // arMaxScore : [N, samples] : m_Type + argMaxScoreOffset = sumSize; + sumSize += AlignMem(dimVolume(argMaxScoreDims) * typeSize(type) * batchSize); + + argMaxBboxOffset = sumSize; + // argMaxBbox : [N, samples, 4] : m_Type + sumSize += AlignMem(dimVolume(argMaxBboxDims) * typeSize(type) * batchSize); + + argMaxLabelOffset = sumSize; + // argMaxLabel : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(argMaxLabelDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortClassScoreOffset = sumSize; + // sortClassScore : [N, samples] : m_Type + sumSize += AlignMem(dimVolume(sortClassScoreDims) * typeSize(type) * batchSize); + + sortClassLabelOffset = sumSize; + // sortClassLabel : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(sortClassLabelDims) * typeSize(type) * batchSize); + + sortClassSampleIdxOffset = sumSize; + // sortClassSampleIdx : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(sortClassSampleIdxDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortClassValidCountOffset = sumSize; + // sortClassValidCount : [N, 1] : kINT32 + sumSize += AlignMem(dimVolume(sortClassValidCountDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortClassPosOffset = sumSize; + // sortClassPos : [N, numClasses+1] : kINT32 + sumSize += AlignMem(dimVolume(sortClassPosDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + sortNMSMarkOffset = sumSize; + // sortNMSMark : [N, samples] : kINT32 + sumSize += AlignMem(dimVolume(sortNMSMarkDims) * typeSize(nvinfer1::DataType::kINT32) * batchSize); + + totalSize = sumSize; +} + +template +cudaError_t argMaxGroup(cudaStream_t stream, int N, nvinfer1::DataType dtype, int samples, int NClass, + const void* inScore, const void* inBbox, const void* validSamples, void* outScore, void* outLabel, void* outBbox) +{ + int maxGridX = dMIN(samples, 512 / N); + dim3 gridDim = {(unsigned int) nAlignDown(maxGridX, 32), (unsigned int) N, 1}; + dim3 threads = {Threads, 1, 1}; + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + argMaxGroup_kernel<<>>( + samples, NClass, inScore, inBbox, validSamples, outScore, outLabel, outBbox); + break; + case nvinfer1::DataType::kHALF: break; + default: assert(false); + } + + return cudaGetLastError(); +} + +template +cudaError_t sortPerClass(cudaStream_t stream, int N, nvinfer1::DataType dtype, int samples, int NClass, int background, + float scoreThreshold, const void* inSampleValidCount, const void* inScorePtr, const void* inLabelPtr, + const void* inBboxPtr, void* outclassStartPosPtr, void* outScorePtr, void* outLabelPtr, void* outSampleIdxPtr, + void* 
outValidSampleCountPtr) +{ + int blocks = N; + int threads = Threads; + + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + sortPerClass_kernel<<>>(samples, NClass, + background, scoreThreshold, inSampleValidCount, inScorePtr, inLabelPtr, inBboxPtr, outclassStartPosPtr, + outScorePtr, outLabelPtr, outSampleIdxPtr, outValidSampleCountPtr); + break; + case nvinfer1::DataType::kHALF: break; + default: assert(false); + } + + return cudaGetLastError(); +}; + +template +cudaError_t PerClassNMS(cudaStream_t stream, int N, nvinfer1::DataType dtype, int samples, int NClass, + const float nmsThreshold, const void* validSampleCount, + // const void *inScore, + const void* inLabel, const void* inBbox, const void* inBboxRefIdx, const void* classStarts, void* outFlagSamples) +{ + int blocks = N; + int threads = Threads; + + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + PerClassNMS_kernel<<>>(samples, NClass, nmsThreshold, + validSampleCount, inLabel, inBbox, inBboxRefIdx, classStarts, outFlagSamples); + break; + case nvinfer1::DataType::kHALF: break; + default: assert(false); + } + + return cudaGetLastError(); +} + +template +cudaError_t KeepTopKGather(cudaStream_t stream, int N, nvinfer1::DataType dtype, int samples, int keepTopK, + const void* validSampleCountPtr, const void* inScorePtr, const void* inLabelPtr, const void* inBboxPtr, + const void* inBboxRefIdxPtr, const void* inFlagSamplesPtr, void* outDetections, int proposal) +{ + int blocks = N; + int threads = Threads; + + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + if (proposal) + { + TopKGatherProposal_kernel<<>>(samples, keepTopK, + validSampleCountPtr, inScorePtr, inLabelPtr, inBboxPtr, inBboxRefIdxPtr, inFlagSamplesPtr, + outDetections); + } + else + { + TopKGather_kernel<<>>(samples, keepTopK, + validSampleCountPtr, inScorePtr, inLabelPtr, inBboxPtr, inBboxRefIdxPtr, inFlagSamplesPtr, + outDetections); + } + break; + case nvinfer1::DataType::kHALF: break; + default: assert(false); + } + + return cudaGetLastError(); +} + +cudaError_t RefineBatchClassNMS(cudaStream_t stream, int N, int samples, nvinfer1::DataType dtype, + const RefineNMSParameters& param, const RefineDetectionWorkSpace& refineOffset, void* workspace, + const void* inScores, const void* inDelta, const void* inCountValid, const void* inROI, void* outDetections) +{ + int NClass = param.numClasses; + int8_t* wsPtr = static_cast(workspace); + void* argMaxScorePtr = wsPtr + refineOffset.argMaxScoreOffset; + void* argMaxLabelPtr = wsPtr + refineOffset.argMaxLabelOffset; + void* argMaxBBoxPtr = wsPtr + refineOffset.argMaxBboxOffset; + + void* sortClassScorePtr = wsPtr + refineOffset.sortClassScoreOffset; + void* sortClassLabelPtr = wsPtr + refineOffset.sortClassLabelOffset; + void* sortClassSampleIdxPtr = wsPtr + refineOffset.sortClassSampleIdxOffset; + void* sortClassValidCountPtr = wsPtr + refineOffset.sortClassValidCountOffset; + void* sortClassPosPtr = wsPtr + refineOffset.sortClassPosOffset; + void* sortNMSMarkPtr = wsPtr + refineOffset.sortNMSMarkOffset; + + cudaError_t status = cudaSuccess; + CUASSERT(cudaMemsetAsync(sortClassValidCountPtr, 0, N * sizeof(int), stream)); + + if (NClass > 1) + { // multiple classes + status = argMaxGroup<32>(stream, N, dtype, samples, NClass, inScores, inDelta, inCountValid, argMaxScorePtr, + argMaxLabelPtr, argMaxBBoxPtr); // argMaxBBoxPtr means delta of bboxes + assert(status == cudaSuccess); + CUASSERT(status); + } + else + { // Only one class + argMaxScorePtr = const_cast(inScores); + argMaxBBoxPtr = 
const_cast(inDelta); + int threads = 512; + int blocks = (N * samples + threads - 1) / threads; + blocks = dMIN(blocks, 8); + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + { + resetMemValue_kernel<<>>(argMaxLabelPtr, N * samples, 0); + break; + } + case nvinfer1::DataType::kHALF: { break; + } + default: assert(false); + } + } + + status = ApplyDelta2Bboxes(stream, N, samples, inROI, argMaxBBoxPtr, argMaxBBoxPtr); + assert(status == cudaSuccess); + + if (samples <= 1024) + { + status = sortPerClass<256, 4>(stream, N, dtype, samples, NClass, param.backgroundLabelId, param.scoreThreshold, + inCountValid, argMaxScorePtr, argMaxLabelPtr, argMaxBBoxPtr, sortClassPosPtr, sortClassScorePtr, + sortClassLabelPtr, sortClassSampleIdxPtr, sortClassValidCountPtr); + } + else if (samples <= 2048) + { + status = sortPerClass<256, 8>(stream, N, dtype, samples, NClass, param.backgroundLabelId, param.scoreThreshold, + inCountValid, argMaxScorePtr, argMaxLabelPtr, argMaxBBoxPtr, sortClassPosPtr, sortClassScorePtr, + sortClassLabelPtr, sortClassSampleIdxPtr, sortClassValidCountPtr); + } + else if (samples <= 4096) + { + status = sortPerClass<256, 16>(stream, N, dtype, samples, NClass, param.backgroundLabelId, param.scoreThreshold, + inCountValid, argMaxScorePtr, argMaxLabelPtr, argMaxBBoxPtr, sortClassPosPtr, sortClassScorePtr, + sortClassLabelPtr, sortClassSampleIdxPtr, sortClassValidCountPtr); + } + else + { + assert(false && "unsupported sortPerClass"); + return cudaErrorLaunchFailure; + } + assert(status == cudaSuccess); + CUASSERT(status); + + status = PerClassNMS<256>(stream, N, dtype, samples, NClass, param.iouThreshold, sortClassValidCountPtr, + // sortClassScorePtr, + sortClassLabelPtr, argMaxBBoxPtr, sortClassSampleIdxPtr, sortClassPosPtr, sortNMSMarkPtr); + assert(status == cudaSuccess); + CUASSERT(status); + + status = KeepTopKGather<256>(stream, N, dtype, samples, param.keepTopK, sortClassValidCountPtr, sortClassScorePtr, + sortClassLabelPtr, argMaxBBoxPtr, sortClassSampleIdxPtr, sortNMSMarkPtr, outDetections, 0); + assert(status == cudaSuccess); + CUASSERT(status); + return status; +} + +struct BF_SCORE +{ + float bg, fg; +}; +// in_scores : [N, samples, 2] +// output_score : [N, samples, 1] +__global__ void extract_fg_kernel(int samples, const void* in_scores, void* output_score) +{ + const BF_SCORE* in = static_cast(in_scores); + float* out = static_cast(output_score); + + int N = blockIdx.x; + int blockOffset = N * samples; + int totalItems = (samples + (blockDim.x - 1)) / blockDim.x; + + for (int i = 0; i < totalItems; i++) + { + int cur_id = i * blockDim.x + threadIdx.x; + out[blockOffset + cur_id] = in[blockOffset + cur_id].fg; + } +} +__global__ void set_offset_kernel(int stride, int size, int* output) +{ + // One block, because batch size shouldn't be too large. 
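+    // Writes output[i] = i * stride for i in [0, size). With stride == inputCnt and size == N + 1 this
+    // produces the segment boundaries [0, inputCnt, 2 * inputCnt, ...] that are later handed to
+    // cub::DeviceSegmentedRadixSort::SortPairsDescending as its begin/end offsets (offsets, offsets + 1),
+    // one segment per image in the batch.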
+ for (int i = threadIdx.x; i < size; i += blockDim.x) + { + output[i] = i * stride; + } +} + +__global__ void resample_kernel(int orig_size, int sample_size, const void* orig_score_ptr, const void* orig_bbox_ptr, + void* sampled_score_ptr, void* sampled_bbox_ptr) +{ + const float* in_score = static_cast(orig_score_ptr); + const BBoxT* in_bbox = static_cast*>(orig_bbox_ptr); + float* out_score = static_cast(sampled_score_ptr); + BBoxT* out_bbox = static_cast*>(sampled_bbox_ptr); + + int N = blockIdx.x; + int blockOffset_in = N * orig_size; + int blockOffset_out = N * sample_size; + int totalItems = (sample_size + (blockDim.x - 1)) / blockDim.x; + + for (int i = 0; i < totalItems; i++) + { + int cur_id = i * blockDim.x + threadIdx.x; + out_score[blockOffset_out + cur_id] = in_score[blockOffset_in + cur_id]; + out_bbox[blockOffset_out + cur_id] = in_bbox[blockOffset_in + cur_id]; + } +} + +cudaError_t proposalRefineBatchClassNMS(cudaStream_t stream, int N, int inputCnt, int samples, nvinfer1::DataType dtype, + const RefineNMSParameters& param, const ProposalWorkSpace& proposalOffset, void* workspace, + const void* inScores, //[N, inputcnt, 2] + const void* inDelta, //[N, inputcnt, 4] + const void* inCountValid, + const void* inAnchors, //[N, inputcnt, 4] + void* outProposals) +{ + int8_t* wsPtr = static_cast(workspace); + void* tempStoragePtr = wsPtr + proposalOffset.tempStorageOffset; + void* preRefineScorePtr = wsPtr + proposalOffset.preRefineScoreOffset; + void* preRefineSortedScorePtr = wsPtr + proposalOffset.preRefineSortedScoreOffset; + void* preRefineBboxPtr = wsPtr + proposalOffset.preRefineBboxOffset; + + void* argMaxScorePtr = wsPtr + proposalOffset.argMaxScoreOffset; + void* argMaxLabelPtr = wsPtr + proposalOffset.argMaxLabelOffset; + void* argMaxBBoxPtr = wsPtr + proposalOffset.argMaxBboxOffset; + + void* sortClassScorePtr = wsPtr + proposalOffset.sortClassScoreOffset; + void* sortClassLabelPtr = wsPtr + proposalOffset.sortClassLabelOffset; + void* sortClassSampleIdxPtr = wsPtr + proposalOffset.sortClassSampleIdxOffset; + void* sortClassValidCountPtr = wsPtr + proposalOffset.sortClassValidCountOffset; + void* sortClassPosPtr = wsPtr + proposalOffset.sortClassPosOffset; + void* sortNMSMarkPtr = wsPtr + proposalOffset.sortNMSMarkOffset; + + cudaError_t status = cudaSuccess; + CUASSERT(cudaMemsetAsync(sortClassValidCountPtr, 0, N * sizeof(int), stream)); + + // extract foreground score + extract_fg_kernel<<>>(inputCnt, inScores, preRefineScorePtr); + CUASSERT(cudaGetLastError()); + + // Here, inDelta are converted to normalize coordinates based on anchors + status = ApplyDelta2Bboxes(stream, N, inputCnt, inAnchors, inDelta, const_cast(inDelta)); + CUASSERT(status); + + // sort the score + // d_key_in: preRefineScorePtr [N, inputCnt, 1] + // d_key_out: preRefineSortedScorePtr + // d_values_in: inDelta [N, inputCnt, 4] + // d_values_out: preRefineBboxPtr + // num_items: inputCnt*N + // num_segments: N + // offsets: [0, inputCnt, inputCnt*2, ..., ] + int* offsets = static_cast(tempStoragePtr); + set_offset_kernel<<<1, 1024, 0, stream>>>(inputCnt, N + 1, offsets); + assert(cudaGetLastError() == cudaSuccess); + tempStoragePtr = static_cast(static_cast(tempStoragePtr) + (N + 1)); + + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortPairsDescending(NULL, temp_storage_bytes, (float*) preRefineScorePtr, + (float*) preRefineSortedScorePtr, (BBoxT*) inDelta, (BBoxT*) preRefineBboxPtr, N * inputCnt, N, + offsets, offsets + 1, 0, 8 * sizeof(float), stream); + + assert((1 << 
23) * N > temp_storage_bytes); + + cub::DeviceSegmentedRadixSort::SortPairsDescending(tempStoragePtr, temp_storage_bytes, (float*) preRefineScorePtr, + (float*) preRefineSortedScorePtr, (BBoxT*) inDelta, (BBoxT*) preRefineBboxPtr, N * inputCnt, N, + offsets, offsets + 1, 0, 8 * sizeof(float), stream); + + int NClass = param.numClasses; + assert(NClass == 1); + if (NClass > 1) + { // multiple classes + } + else + { // Only one class + resample_kernel<<>>( + inputCnt, samples, preRefineSortedScorePtr, preRefineBboxPtr, argMaxScorePtr, argMaxBBoxPtr); + + int threads = 512; + int blocks = (N * samples + threads - 1) / threads; + blocks = dMIN(blocks, 8); + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + { + resetMemValue_kernel<<>>(argMaxLabelPtr, N * samples, 0); + break; + } + case nvinfer1::DataType::kHALF: { break; + } + default: assert(false); + } + } + + if (samples <= 1024) + { + status = sortPerClass<256, 4>(stream, N, dtype, samples, NClass, param.backgroundLabelId, param.scoreThreshold, + inCountValid, argMaxScorePtr, argMaxLabelPtr, argMaxBBoxPtr, sortClassPosPtr, sortClassScorePtr, + sortClassLabelPtr, sortClassSampleIdxPtr, sortClassValidCountPtr); + } + else if (samples <= 2048) + { + status = sortPerClass<256, 8>(stream, N, dtype, samples, NClass, param.backgroundLabelId, param.scoreThreshold, + inCountValid, argMaxScorePtr, argMaxLabelPtr, argMaxBBoxPtr, sortClassPosPtr, sortClassScorePtr, + sortClassLabelPtr, sortClassSampleIdxPtr, sortClassValidCountPtr); + } + else if (samples <= 4096) + { + status = sortPerClass<256, 16>(stream, N, dtype, samples, NClass, param.backgroundLabelId, param.scoreThreshold, + inCountValid, argMaxScorePtr, argMaxLabelPtr, argMaxBBoxPtr, sortClassPosPtr, sortClassScorePtr, + sortClassLabelPtr, sortClassSampleIdxPtr, sortClassValidCountPtr); + } + else + { + assert(false && "unsupported sortPerClass"); + return cudaErrorLaunchFailure; + } + assert(status == cudaSuccess); + CUASSERT(status); + + status = PerClassNMS<256>(stream, N, dtype, samples, NClass, param.iouThreshold, sortClassValidCountPtr, + // sortClassScorePtr, + sortClassLabelPtr, argMaxBBoxPtr, sortClassSampleIdxPtr, sortClassPosPtr, sortNMSMarkPtr); + assert(status == cudaSuccess); + CUASSERT(status); + + status = KeepTopKGather<256>(stream, N, dtype, samples, param.keepTopK, sortClassValidCountPtr, sortClassScorePtr, + sortClassLabelPtr, argMaxBBoxPtr, sortClassSampleIdxPtr, sortNMSMarkPtr, outProposals, 1); + assert(status == cudaSuccess); + CUASSERT(status); + + return status; +} + +struct BBOX +{ + float y1, x1, y2, x2; +}; + +struct DELTA +{ + float dy, dx, logdh, logdw; +}; + +__global__ void apply_delta_kernel(int samples, const void* anchors, const void* delta, void* outputBbox) +{ + + const BBOX* anchors_in = static_cast(anchors); + const DELTA* delta_in = static_cast(delta); + BBOX* bbox_out = static_cast(outputBbox); + + int N = blockIdx.x; + int blockOffset = N * samples; + int totalItems = (samples + (blockDim.x - 1)) / blockDim.x; + + for (int i = 0; i < totalItems; i++) + { + int cur_id = i * blockDim.x + threadIdx.x; + + BBOX cur_anchor_yxyx = anchors_in[blockOffset + cur_id]; + // convert yxyx -> cyxhw + // cy, cx, h, w + BBOX cur_anchor_cyxhw; + + cur_anchor_cyxhw.y1 = (cur_anchor_yxyx.y1 + cur_anchor_yxyx.y2) / 2; + cur_anchor_cyxhw.x1 = (cur_anchor_yxyx.x1 + cur_anchor_yxyx.x2) / 2; + cur_anchor_cyxhw.y2 = (cur_anchor_yxyx.y2 - cur_anchor_yxyx.y1); + cur_anchor_cyxhw.x2 = (cur_anchor_yxyx.x2 - cur_anchor_yxyx.x1); + + DELTA cur_delta = delta_in[blockOffset + 
cur_id]; + + // multiply std_dev + cur_delta.dy *= 0.1; + cur_delta.dx *= 0.1; + cur_delta.logdh *= 0.2; + cur_delta.logdw *= 0.2; + + // apply delta + cur_anchor_cyxhw.y1 += cur_delta.dy * cur_anchor_cyxhw.y2; + cur_anchor_cyxhw.x1 += cur_delta.dx * cur_anchor_cyxhw.x2; + cur_anchor_cyxhw.y2 *= expf(cur_delta.logdh); + cur_anchor_cyxhw.x2 *= expf(cur_delta.logdw); + + cur_anchor_yxyx.y1 = cur_anchor_cyxhw.y1 - 0.5 * cur_anchor_cyxhw.y2; + cur_anchor_yxyx.x1 = cur_anchor_cyxhw.x1 - 0.5 * cur_anchor_cyxhw.x2; + cur_anchor_yxyx.y2 = cur_anchor_yxyx.y1 + cur_anchor_cyxhw.y2; + cur_anchor_yxyx.x2 = cur_anchor_yxyx.x1 + cur_anchor_cyxhw.x2; + + // clip bbox: a more precision clip method based on real window could be implemented + cur_anchor_yxyx.y1 = dMAX(dMIN(cur_anchor_yxyx.y1, 1.0), 0.0); + cur_anchor_yxyx.x1 = dMAX(dMIN(cur_anchor_yxyx.x1, 1.0), 0.0); + cur_anchor_yxyx.y2 = dMAX(dMIN(cur_anchor_yxyx.y2, 1.0), 0.0); + cur_anchor_yxyx.x2 = dMAX(dMIN(cur_anchor_yxyx.x2, 1.0), 0.0); + + bbox_out[blockOffset + cur_id].y1 = cur_anchor_yxyx.y1; + bbox_out[blockOffset + cur_id].x1 = cur_anchor_yxyx.x1; + bbox_out[blockOffset + cur_id].y2 = cur_anchor_yxyx.y2; + bbox_out[blockOffset + cur_id].x2 = cur_anchor_yxyx.x2; + } +} + +cudaError_t ApplyDelta2Bboxes(cudaStream_t stream, int N, + int samples, // number of anchors per image + const void* anchors, // [N, anchors, (y1, x1, y2, x2)] + const void* delta, //[N, anchors, (dy, dx, log(dh), log(dw)]) + void* outputBbox //[N, anchors, (y1, x1, y2, x2)] + ) +{ + + int blocks = N; + int threads = dMIN(samples, 1024); + + // delta multiply bbox_std + // apply delta steps: + // cy = anchor_cy + dy*height + // cx = anchor_cx + dx*weight + // h = exp(dh)*anchor_h + // w = exp(dw)*anchor_w + // clip the bbox + + apply_delta_kernel<<>>(samples, anchors, delta, outputBbox); + + return cudaGetLastError(); +} + +template +__device__ inline Tfeat interpolateBilinear(const Tfeat* src, xy_t srcDims, float y, float x) +{ + const int y0 = static_cast(y); + const float yAlpha = y - static_cast(y0); + const int x0 = static_cast(x); + const float xAlpha = x - static_cast(x0); + + assert(y0 < srcDims.y); + assert(x0 < srcDims.x); + + const int y1 = (yAlpha == 0) ? y0 : y0 + 1; // ceil + const int x1 = (xAlpha == 0) ? 
x0 : x0 + 1; // ceil + + assert(y1 < srcDims.y); + assert(x1 < srcDims.x); + + const float src00 = src[(y0) *srcDims.x + (x0)]; + const float src01 = src[(y0) *srcDims.x + (x1)]; + const float src10 = src[(y1) *srcDims.x + (x0)]; + const float src11 = src[(y1) *srcDims.x + (x1)]; + + const float src0 = src00 * (1 - xAlpha) + src01 * xAlpha; + const float src1 = src10 * (1 - xAlpha) + src11 * xAlpha; + + return src0 * (1 - yAlpha) + src1 * yAlpha; +} + +template +__global__ void roiAlign_kernel(int featureCount, int roiCount, + + float threshold, const Trois* rois, + + const Tfeat* P2, const xy_t P2dims, const Tfeat* P3, const xy_t P3dims, const Tfeat* P4, const xy_t P4dims, + const Tfeat* P5, const xy_t P5dims, + + Tfeat* pooled, const xy_t poolDims) +{ + const int batch = blockIdx.x; + const int feature = blockIdx.y; + + // int prev_invalid = -1; + for (int roiIdx = threadIdx.x; roiIdx < roiCount; roiIdx += blockDim.x) + { + const Trois* roi = rois + 4 * (batch * roiCount + roiIdx); + + const float y1 = roi[0]; + const float x1 = roi[1]; + const float y2 = roi[2]; + const float x2 = roi[3]; + + if (!(0 <= y1 && y1 <= 1 && 0 <= x1 && x1 <= 1 && 0 <= y2 && y2 <= 1 && 0 <= x2 && x2 <= 1 && y1 < y2 + && x1 < x2)) + { + + continue; + } + else + { + } + + const float hw = (y2 - y1) * (x2 - x1); + + const Tfeat* src = P2; + xy_t srcDims = P2dims; + int iP = 2; + + if (hw > threshold) + { + src = P3; + srcDims = P3dims; + ++iP; + } + threshold *= 4; + + if (hw > threshold) + { + src = P4; + srcDims = P4dims; + ++iP; + } + threshold *= 4; + + if (hw > threshold) + { + src = P5; + srcDims = P5dims; + ++iP; + } + + src += srcDims.x * srcDims.y * (batch * featureCount + feature); + + Tfeat* dst + = pooled + poolDims.x * poolDims.y * (batch * roiCount * featureCount + roiIdx * featureCount + feature); + + const float yStart = y1 * (srcDims.y - 1); + const float xStart = x1 * (srcDims.x - 1); + + const float yEnd = y2 * (srcDims.y - 1); + const float xEnd = x2 * (srcDims.x - 1); + + const float yDelta = (yEnd - yStart) / (poolDims.y - 1); + const float xDelta = (xEnd - xStart) / (poolDims.x - 1); + + for (int yy = 0; yy < poolDims.y; ++yy) + { + const float ySample = min(yStart + yDelta * yy, yEnd); + + for (int xx = 0; xx < poolDims.x; ++xx) + { + const float xSample = min(xStart + xDelta * xx, xEnd); + + float result = interpolateBilinear(src, srcDims, ySample, xSample); + + *dst = result; + dst++; + } + } + } +} + +cudaError_t roiAlign(cudaStream_t stream, int batchSize, int featureCount, int roiCount, float firstThreshold, + + const void* rois, const void* const layers[], const xy_t* layerDims, + + void* pooled, const xy_t poolDims) +{ + const dim3 blocks(batchSize, featureCount); + const int threads(256); + + roiAlign_kernel<<>>(featureCount, roiCount, firstThreshold, + static_cast(rois), + + static_cast(layers[0]), layerDims[0], static_cast(layers[1]), layerDims[1], + static_cast(layers[2]), layerDims[2], static_cast(layers[3]), layerDims[3], + + static_cast(pooled), poolDims); + return cudaGetLastError(); +} + +__global__ void resize_nearest_kernel_2d(int nbatch, float scale, int2 osize, float const* idata, int istride, + int ibatchstride, float* odata, int ostride, int obatchstride) +{ + + int x0 = threadIdx.x + blockIdx.x * blockDim.x; + int y0 = threadIdx.y + blockIdx.y * blockDim.y; + int z0 = blockIdx.z; + for (int batch = z0; batch < nbatch; batch += gridDim.z) + { + for (int oy = y0; oy < osize.y; oy += blockDim.y * gridDim.y) + { + for (int ox = x0; ox < osize.x; ox += blockDim.x * 
gridDim.x) + { + int ix = int(ox / scale); + int iy = int(oy / scale); + odata[batch * obatchstride + oy * ostride + ox] = idata[batch * ibatchstride + iy * istride + ix]; + } + } + } +} + +void resizeNearest(dim3 grid, dim3 block, cudaStream_t stream, int nbatch, float scale, int2 osize, float const* idata, + int istride, int ibatchstride, float* odata, int ostride, int obatchstride) +{ + + resize_nearest_kernel_2d<<>>( + nbatch, scale, osize, idata, istride, ibatchstride, odata, ostride, obatchstride); +} + +struct BOX +{ + float y1, x1, y2, x2; +}; + +struct DETECTION +{ + float y1, x1, y2, x2, class_id, score; +}; + +__global__ void specialslice_kernel(int samples, const void* idata, void* odata) +{ + + int N = blockIdx.x; + int blockOffset = N * samples; + int totalItems = (samples + (blockDim.x - 1)) / blockDim.x; + const DETECTION* in_detections = static_cast(idata); + BOX* out_bboxes = static_cast(odata); + + for (int i = 0; i < totalItems; i++) + { + int cur_id = i * blockDim.x + threadIdx.x; + + out_bboxes[blockOffset + cur_id].y1 = in_detections[blockOffset + cur_id].y1; + out_bboxes[blockOffset + cur_id].x1 = in_detections[blockOffset + cur_id].x1; + out_bboxes[blockOffset + cur_id].y2 = in_detections[blockOffset + cur_id].y2; + out_bboxes[blockOffset + cur_id].x2 = in_detections[blockOffset + cur_id].x2; + } +} + +void specialSlice(cudaStream_t stream, int batch_size, int boxes_cnt, const void* idata, void* odata) +{ + int blocks = batch_size; + int threads = dMIN(boxes_cnt, 2048); + + specialslice_kernel<<>>(boxes_cnt, idata, odata); +} diff --git a/plugin/common/kernels/maskRCNNKernels.h b/plugin/common/kernels/maskRCNNKernels.h new file mode 100644 index 00000000..4092ebb1 --- /dev/null +++ b/plugin/common/kernels/maskRCNNKernels.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TRT_MASKRCNN_UTILS_H +#define TRT_MASKRCNN_UTILS_H + +#include "NvInfer.h" +#include "plugin.h" + +using namespace nvinfer1; + +inline size_t nAlignUp(size_t x, size_t align) +{ + size_t mask = align - 1; + assert((align & mask) == 0); // power of 2 + return (x + mask) & (~mask); +} + +inline size_t nAlignDown(size_t x, size_t align) +{ + size_t mask = align - 1; + assert((align & mask) == 0); // power of 2 + return (x) & (~mask); +} + +inline size_t dimVolume(const nvinfer1::Dims& dims) +{ + size_t volume = 1; + for (int i = 0; i < dims.nbDims; ++i) + volume *= dims.d[i]; + + return volume; +} + +inline size_t typeSize(const nvinfer1::DataType type) +{ + switch (type) + { + case nvinfer1::DataType::kFLOAT: return sizeof(float); + case nvinfer1::DataType::kHALF: return sizeof(uint16_t); + case nvinfer1::DataType::kINT8: return sizeof(uint8_t); + case nvinfer1::DataType::kINT32: return sizeof(uint32_t); + default: return 0; + } +} + +#define AlignMem(x) nAlignUp(x, 256) + +template +struct CudaBind +{ + size_t mSize; + void* mPtr; + + CudaBind(size_t size) + { + mSize = size; + CUASSERT(cudaMalloc(&mPtr, sizeof(Dtype) * mSize)); + } + + ~CudaBind() + { + if (mPtr != nullptr) + { + CUASSERT(cudaFree(mPtr)); + mPtr = nullptr; + } + } +}; + +struct RefineNMSParameters +{ + int backgroundLabelId, numClasses, keepTopK; + float scoreThreshold, iouThreshold; +}; + +struct RefineDetectionWorkSpace +{ + RefineDetectionWorkSpace( + const int batchSize, const int sampleCount, const RefineNMSParameters& param, const nvinfer1::DataType type); + + RefineDetectionWorkSpace() = default; + + nvinfer1::DimsHW argMaxScoreDims; + nvinfer1::DimsHW argMaxBboxDims; + nvinfer1::DimsHW argMaxLabelDims; + nvinfer1::DimsHW sortClassScoreDims; + nvinfer1::DimsHW sortClassLabelDims; + nvinfer1::DimsHW sortClassSampleIdxDims; + nvinfer1::Dims sortClassValidCountDims = {1, {1, 0}, {nvinfer1::DimensionType::kINDEX}}; + nvinfer1::DimsHW sortClassPosDims; + nvinfer1::DimsHW sortNMSMarkDims; + + size_t argMaxScoreOffset = 0; + size_t argMaxBboxOffset = 0; + size_t argMaxLabelOffset = 0; + size_t sortClassScoreOffset = 0; + size_t sortClassLabelOffset = 0; + size_t sortClassSampleIdxOffset = 0; + size_t sortClassValidCountOffset = 0; + size_t sortClassPosOffset = 0; + size_t sortNMSMarkOffset = 0; + size_t totalSize = 0; +}; + +struct ProposalWorkSpace +{ + ProposalWorkSpace(const int batchSize, const int inputCnt, const int sampleCount, const RefineNMSParameters& param, + const nvinfer1::DataType type); + + ProposalWorkSpace() = default; + + nvinfer1::DimsHW preRefineScoreDims; + nvinfer1::DimsHW preRefineSortedScoreDims; + nvinfer1::DimsHW preRefineBboxDims; + nvinfer1::DimsHW argMaxScoreDims; + nvinfer1::DimsHW argMaxBboxDims; + nvinfer1::DimsHW argMaxLabelDims; + nvinfer1::DimsHW sortClassScoreDims; + nvinfer1::DimsHW sortClassLabelDims; + nvinfer1::DimsHW sortClassSampleIdxDims; + nvinfer1::Dims sortClassValidCountDims = {1, {1, 0}, {nvinfer1::DimensionType::kINDEX}}; + nvinfer1::DimsHW sortClassPosDims; + nvinfer1::DimsHW sortNMSMarkDims; + + size_t tempStorageOffset = 0; + size_t preRefineScoreOffset = 0; + size_t preRefineSortedScoreOffset = 0; + size_t preRefineBboxOffset = 0; + size_t argMaxScoreOffset = 0; + size_t argMaxBboxOffset = 0; + size_t argMaxLabelOffset = 0; + size_t sortClassScoreOffset = 0; + size_t sortClassLabelOffset = 0; + size_t sortClassSampleIdxOffset = 0; + size_t sortClassValidCountOffset = 0; + size_t sortClassPosOffset = 0; + size_t sortNMSMarkOffset = 0; + size_t totalSize = 
0; +}; + +cudaError_t RefineBatchClassNMS(cudaStream_t stream, int N, int samples, nvinfer1::DataType dtype, + const RefineNMSParameters& param, const RefineDetectionWorkSpace& refineOffset, void* workspace, + const void* inScores, const void* inDelta, const void* inCountValid, const void* inROI, void* outDetections); + +cudaError_t proposalRefineBatchClassNMS(cudaStream_t stream, int N, + int inputCnt, // candidate anchors + int samples, // preNMS_topK + nvinfer1::DataType dtype, const RefineNMSParameters& param, const ProposalWorkSpace& proposalOffset, + void* workspace, const void* inScores, const void* inDelta, const void* inCountValid, const void* inAnchors, + void* outProposals); + +cudaError_t ApplyDelta2Bboxes(cudaStream_t stream, int N, + int samples, // number of anchors per image + const void* anchors, // [N, anchors, (y1, x1, y2, x2)] + const void* delta, //[N, anchors, (dy, dx, log(dh), log(dw)] + void* outputBbox); + +struct xy_t +{ + int y; + int x; + + xy_t() + : y(0) + , x(0) + { + } + xy_t(int y_, int x_) + : y(y_) + , x(x_) + { + } +}; +// PYRAMID ROIALIGN +cudaError_t roiAlign(cudaStream_t stream, int batchSize, int featureCount, int roiCount, float firstThreshold, + + const void* rois, const void* const layers[], const xy_t* layerDims, + + void* pooled, const xy_t poolDims); + +// RESIZE NEAREST +void resizeNearest(dim3 grid, dim3 block, cudaStream_t stream, int nbatch, float scale, int2 osize, float const* idata, + int istride, int ibatchstride, float* odata, int ostride, int obatchstride); +// SPECIAL SLICE +void specialSlice(cudaStream_t stream, int batch_size, int boxes_cnt, const void* idata, void* odata); + +#endif // TRT_MASKRCNN_UTILS_H diff --git a/plugin/common/serialize.hpp b/plugin/common/serialize.hpp new file mode 100644 index 00000000..a98d57bb --- /dev/null +++ b/plugin/common/serialize.hpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +#include +using std::cout; +using std::cerr; +using std::endl; + +template +inline void serialize_value(void** buffer, T const& value); + +template +inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); + +namespace { + +template +struct Serializer {}; + +template +struct Serializer::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t serialized_size(T const& value) { + return sizeof(T); + } + static void serialize(void** buffer, T const& value) { + ::memcpy(*buffer, &value, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + } + static void deserialize(void const** buffer, size_t* buffer_size, T* value) { + assert(*buffer_size >= sizeof(T)); + ::memcpy(value, *buffer, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + *buffer_size -= sizeof(T); + } +}; + +template<> +struct Serializer { + static size_t serialized_size(const char* value) { + return strlen(value) + 1; + } + static void serialize(void** buffer, const char* value) { + ::strcpy(static_cast(*buffer), value); + reinterpret_cast(*buffer) += strlen(value) + 1; + } + static void deserialize(void const** buffer, size_t* buffer_size, const char** value) { + *value = static_cast(*buffer); + size_t data_size = strnlen(*value, *buffer_size) + 1; + assert(*buffer_size >= data_size); + reinterpret_cast(*buffer) += data_size; + *buffer_size -= data_size; + } +}; + +template +struct Serializer, typename std::enable_if< + std::is_arithmetic::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t serialized_size(std::vector const& value) { + return sizeof(value.size()) + value.size() * sizeof(T); + } + static void serialize(void** buffer, std::vector const& value) { + serialize_value(buffer, value.size()); + size_t nbyte = value.size() * sizeof(T); + ::memcpy(*buffer, value.data(), nbyte); + reinterpret_cast(*buffer) += nbyte; + } + static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) { + size_t size; + deserialize_value(buffer, buffer_size, &size); + value->resize(size); + size_t nbyte = value->size() * sizeof(T); + assert(*buffer_size >= nbyte); + ::memcpy(value->data(), *buffer, nbyte); + reinterpret_cast(*buffer) += nbyte; + *buffer_size -= nbyte; + } +}; + +} // namespace + +template +inline size_t serialized_size(T const& value) { + return Serializer::serialized_size(value); +} + +template +inline void serialize_value(void** buffer, T const& value) { + return Serializer::serialize(buffer, value); +} + +template +inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) { + return Serializer::deserialize(buffer, buffer_size, value); +} diff --git a/plugin/cropAndResizePlugin/README.md b/plugin/cropAndResizePlugin/README.md index 947b4402..e96b92ee 100644 --- a/plugin/cropAndResizePlugin/README.md +++ b/plugin/cropAndResizePlugin/README.md @@ -11,7 +11,7 @@ ## Description -The `cropAndResizePlugin` performs object detection for the Faster R-CNN model. +The `cropAndResizePlugin` performs object detection for the Faster R-CNN model. This plugin is included in TensorRT and used in [sampleUffFasterRCNN] to perform inference. `cropAndResizePlugin` implements the TensorFlow style of ROIPooling(a.k.a. CropAndResize). It crops multiple region of interests(ROIs) from the input image with given ROI coordinates and then (bilinearly) resizes the cropped patches to a target spatial(width and height) size. 
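As a rough illustration of the operation described above, the following is a minimal CPU sketch of TensorFlow-style crop-and-resize for a single ROI and a single channel. It is not the plugin's actual CUDA implementation; the function name is illustrative, and out-of-bounds and extrapolation handling are omitted.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Bilinearly sample one normalized box [y1, x1, y2, x2] from an H x W single-channel
// image into a crop_h x crop_w patch (TensorFlow CropAndResize semantics).
std::vector<float> cropAndResizeOne(const std::vector<float>& image, int H, int W,
    float y1, float x1, float y2, float x2, int crop_h, int crop_w)
{
    std::vector<float> out(crop_h * crop_w, 0.0f);
    const float hScale = (crop_h > 1) ? (y2 - y1) * (H - 1) / (crop_h - 1) : 0.0f;
    const float wScale = (crop_w > 1) ? (x2 - x1) * (W - 1) / (crop_w - 1) : 0.0f;
    for (int oy = 0; oy < crop_h; ++oy)
    {
        const float iy = (crop_h > 1) ? y1 * (H - 1) + oy * hScale : 0.5f * (y1 + y2) * (H - 1);
        const int top = static_cast<int>(std::floor(iy));
        const int bottom = std::min(top + 1, H - 1);
        const float dy = iy - top;
        for (int ox = 0; ox < crop_w; ++ox)
        {
            const float ix = (crop_w > 1) ? x1 * (W - 1) + ox * wScale : 0.5f * (x1 + x2) * (W - 1);
            const int left = static_cast<int>(std::floor(ix));
            const int right = std::min(left + 1, W - 1);
            const float dx = ix - left;
            // Interpolate along x on the two neighbouring rows, then along y.
            const float t = image[top * W + left] * (1 - dx) + image[top * W + right] * dx;
            const float b = image[bottom * W + left] * (1 - dx) + image[bottom * W + right] * dx;
            out[oy * crop_w + ox] = t * (1 - dy) + b * dy;
        }
    }
    return out;
}
```

The plugin applies the same interpolation on the GPU for every ROI, channel, and batch entry.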
@@ -77,4 +77,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. +There are no known issues in this plugin. \ No newline at end of file diff --git a/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp b/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp index caccf6c8..4e0cb5d5 100644 --- a/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp +++ b/plugin/cropAndResizePlugin/cropAndResizePlugin.cpp @@ -30,7 +30,7 @@ namespace { static const char* CROP_AND_RESIZE_PLUGIN_VERSION{"1"}; static const char* CROP_AND_RESIZE_PLUGIN_NAME{"CropAndResize"}; -} +} // namespace // Static class fields initialization PluginFieldCollection CropAndResizePluginCreator::mFC{}; @@ -93,9 +93,7 @@ CropAndResizePlugin::CropAndResizePlugin(const std::string name, int crop_width, { } -CropAndResizePlugin::~CropAndResizePlugin() -{ -} +CropAndResizePlugin::~CropAndResizePlugin() {} const char* CropAndResizePlugin::getPluginType() const { @@ -172,9 +170,7 @@ bool CropAndResizePlugin::supportsFormat(DataType type, PluginFormat format) con } } -void CropAndResizePlugin::terminate() -{ -} +void CropAndResizePlugin::terminate() {} void CropAndResizePlugin::destroy() { @@ -242,9 +238,7 @@ void CropAndResizePlugin::attachToContext( } // Detach the plugin object from its execution context. -void CropAndResizePlugin::detachFromContext() -{ -} +void CropAndResizePlugin::detachFromContext() {} CropAndResizePluginCreator::CropAndResizePluginCreator() { @@ -254,9 +248,7 @@ CropAndResizePluginCreator::CropAndResizePluginCreator() mFC.fields = mPluginAttributes.data(); } -CropAndResizePluginCreator::~CropAndResizePluginCreator() -{ -} +CropAndResizePluginCreator::~CropAndResizePluginCreator() {} const char* CropAndResizePluginCreator::getPluginName() const { diff --git a/plugin/detectionLayerPlugin/CMakeLists.txt b/plugin/detectionLayerPlugin/CMakeLists.txt new file mode 100644 index 00000000..1dbe788c --- /dev/null +++ b/plugin/detectionLayerPlugin/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +file(GLOB SRCS *.cpp *.cu) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) diff --git a/plugin/detectionLayerPlugin/README.md b/plugin/detectionLayerPlugin/README.md new file mode 100644 index 00000000..60f3bbd9 --- /dev/null +++ b/plugin/detectionLayerPlugin/README.md @@ -0,0 +1,67 @@ +# DetectionLayer + +**Table Of Contents** +- [Description](#description) + * [Structure](#structure) +- [Parameters](#parameters) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +The `DetectionLayer` plugin performs bounding boxes refinement of MaskRCNN's detection head and generate the final detection output of MaskRCNN. It is used in sampleMaskRCNN. + + +### Structure + +This plugin supports the NCHW format. 
It takes three input tensors: `delta_bbox`, `score` and `roi`. + +`delta_bbox` is the refinement information for the ROI boxes generated by `ProposalLayer`. The `delta_bbox` tensor's shape is `[N, rois, num_classes*4, 1, 1]` where `N` is the batch size, +`rois` is the total number of ROI box candidates per image, and `num_classes*4` means 4 refinement elements (`[dy, dx, dh, dw]`) per ROI box for each class. + +`score` is the predicted class scores of the ROI boxes generated by `ProposalLayer`, of shape `[N, rois, num_classes, 1, 1]`. An `argmax` operation in `DetectionLayer` determines the final class of the detection +candidates. + +`roi` contains the coordinates of the ROI box candidates from `ProposalLayer`, of shape `[N, rois, 4]`. + +This plugin generates output of shape `[N, keep_topk, 6]` where `keep_topk` is the maximum number of detections kept after NMS and `6` refers to the 6 elements of a detection: `[y1, x1, y2, x2, +class_label, score]`. + +## Parameters + +This plugin has the plugin creator class `DetectionLayerPluginCreator` and the plugin class `DetectionLayer`. + +The following parameters are used to create a `DetectionLayer` instance: + +| Type | Parameter | Description +|--------------------|------------------------------------|-------------------------------------------------------- +|`int` |`num_classes` |Number of detection classes (including `background`). `num_classes=81` for the COCO dataset. +|`int` |`keep_topk` |Number of detections kept after NMS. +|`float` |`score_threshold` |Confidence threshold value. This plugin drops a detection if its class confidence (score) is below `score_threshold`. +|`float` |`iou_threshold` |IOU threshold value used in NMS. + + +## Additional resources + +The following resources provide a deeper understanding of the `DetectionLayer` plugin: + +- [MaskRCNN](https://github.com/matterport/Mask_RCNN) + + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) +documentation. + + +## Changelog + +June 2019 +This is the first release of this `README.md` file. + + +## Known issues + +There are no known issues in this plugin. diff --git a/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp b/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp new file mode 100644 index 00000000..64a2bec3 --- /dev/null +++ b/plugin/detectionLayerPlugin/detectionLayerPlugin.cpp @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include "detectionLayerPlugin.h" +#include "plugin.h" +#include + +using namespace nvinfer1; +using namespace plugin; +using nvinfer1::plugin::DetectionLayer; +using nvinfer1::plugin::DetectionLayerPluginCreator; + +namespace +{ +const char* DETECTIONLAYER_PLUGIN_VERSION{"1"}; +const char* DETECTIONLAYER_PLUGIN_NAME{"DetectionLayer_TRT"}; +} // namespace + +PluginFieldCollection DetectionLayerPluginCreator::mFC{}; +std::vector DetectionLayerPluginCreator::mPluginAttributes; + +DetectionLayerPluginCreator::DetectionLayerPluginCreator() +{ + + mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* DetectionLayerPluginCreator::getPluginName() const +{ + return DETECTIONLAYER_PLUGIN_NAME; +}; + +const char* DetectionLayerPluginCreator::getPluginVersion() const +{ + return DETECTIONLAYER_PLUGIN_VERSION; +}; + +const PluginFieldCollection* DetectionLayerPluginCreator::getFieldNames() +{ + return &mFC; +}; + +IPluginV2Ext* DetectionLayerPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +{ + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "num_classes")) + { + assert(fields[i].type == PluginFieldType::kINT32); + mNbClasses = *(static_cast(fields[i].data)); + } + if (!strcmp(attrName, "keep_topk")) + { + assert(fields[i].type == PluginFieldType::kINT32); + mKeepTopK = *(static_cast(fields[i].data)); + } + if (!strcmp(attrName, "score_threshold")) + { + assert(fields[i].type == PluginFieldType::kFLOAT32); + mScoreThreshold = *(static_cast(fields[i].data)); + } + if (!strcmp(attrName, "iou_threshold")) + { + assert(fields[i].type == PluginFieldType::kFLOAT32); + mIOUThreshold = *(static_cast(fields[i].data)); + } + } + return new DetectionLayer(mNbClasses, mKeepTopK, mScoreThreshold, mIOUThreshold); +}; + +IPluginV2Ext* DetectionLayerPluginCreator::deserializePlugin(const char* name, const void* data, size_t length) +{ + return new DetectionLayer(data, length); +}; + +DetectionLayer::DetectionLayer(int num_classes, int keep_topk, float score_threshold, float iou_threshold) + : mNbClasses(num_classes) + , mKeepTopK(keep_topk) + , mScoreThreshold(score_threshold) + , mIOUThreshold(iou_threshold) +{ + mBackgroundLabel = 0; + assert(mNbClasses > 0); + assert(mKeepTopK > 0); + assert(score_threshold >= 0.0f); + assert(iou_threshold > 0.0f); + + mParam.backgroundLabelId = 0; + mParam.numClasses = mNbClasses; + mParam.keepTopK = mKeepTopK; + mParam.scoreThreshold = mScoreThreshold; + mParam.iouThreshold = mIOUThreshold; + + mType = DataType::kFLOAT; +}; + +int DetectionLayer::getNbOutputs() const +{ + return 1; +}; + +int DetectionLayer::initialize() +{ + //@Init the mValidCnt and mDecodedBboxes for max batch size + std::vector tempValidCnt(mMaxBatchSize, mAnchorsCnt); + + mValidCnt = std::make_shared>(mMaxBatchSize); + + CUASSERT(cudaMemcpy( + mValidCnt->mPtr, static_cast(tempValidCnt.data()), sizeof(int) * mMaxBatchSize, cudaMemcpyHostToDevice)); + + return 0; +}; + +void DetectionLayer::terminate(){}; + +void 
DetectionLayer::destroy() +{ + delete this; +}; + +bool DetectionLayer::supportsFormat(DataType type, PluginFormat format) const +{ + return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); +}; + +const char* DetectionLayer::getPluginType() const +{ + return "DetectionLayer_TRT"; +}; + +const char* DetectionLayer::getPluginVersion() const +{ + return "1"; +}; + +IPluginV2Ext* DetectionLayer::clone() const +{ + return new DetectionLayer(*this); +}; + +void DetectionLayer::setPluginNamespace(const char* libNamespace) +{ + mNameSpace = libNamespace; +}; + +const char* DetectionLayer::getPluginNamespace() const +{ + return mNameSpace.c_str(); +} + +size_t DetectionLayer::getSerializationSize() const +{ + return sizeof(int) * 2 + sizeof(float) * 2 + sizeof(int) * 2; +}; + +void DetectionLayer::serialize(void* buffer) const +{ + char *d = reinterpret_cast(buffer), *a = d; + write(d, mNbClasses); + write(d, mKeepTopK); + write(d, mScoreThreshold); + write(d, mIOUThreshold); + write(d, mMaxBatchSize); + write(d, mAnchorsCnt); + ASSERT(d == a + getSerializationSize()); +}; + +DetectionLayer::DetectionLayer(const void* data, size_t length) +{ + const char *d = reinterpret_cast(data), *a = d; + int num_classes = read(d); + int keep_topk = read(d); + float score_threshold = read(d); + float iou_threshold = read(d); + mMaxBatchSize = read(d); + mAnchorsCnt = read(d); + ASSERT(d == a + length); + + mNbClasses = num_classes; + mKeepTopK = keep_topk; + mScoreThreshold = score_threshold; + mIOUThreshold = iou_threshold; + + mParam.backgroundLabelId = 0; + mParam.numClasses = mNbClasses; + mParam.keepTopK = mKeepTopK; + mParam.scoreThreshold = mScoreThreshold; + mParam.iouThreshold = mIOUThreshold; + + mType = DataType::kFLOAT; +}; + +void DetectionLayer::check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims) +{ + // classifier_delta_bbox[N, anchors, num_classes*4, 1, 1] + // classifier_class[N, anchors, num_classes, 1, 1] + // rpn_rois[N, anchors, 4] + assert(nbInputDims == 3); + // delta_bbox + assert(inputs[0].nbDims == 4 && inputs[0].d[1] == mNbClasses * 4); + // score + assert(inputs[1].nbDims == 4 && inputs[1].d[1] == mNbClasses); + // roi + assert(inputs[2].nbDims == 2 && inputs[2].d[1] == 4); +}; + +size_t DetectionLayer::getWorkspaceSize(int batch_size) const +{ + RefineDetectionWorkSpace refine(batch_size, mAnchorsCnt, mParam, mType); + return refine.totalSize; +}; + +Dims DetectionLayer::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) +{ + + check_valid_inputs(inputs, nbInputDims); + assert(index == 0); + + // [N, anchors, (y1, x1, y2, x2, class_id, score)] + nvinfer1::Dims detections; + + detections.nbDims = 2; + // number of anchors + detections.d[0] = mKeepTopK; + detections.d[1] = 6; + + return detections; +} + +int DetectionLayer::enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +{ + + void* detections = outputs[0]; + + // refine detection + RefineDetectionWorkSpace refDetcWorkspace(batch_size, mAnchorsCnt, mParam, mType); + cudaError_t status = RefineBatchClassNMS(stream, batch_size, mAnchorsCnt, + DataType::kFLOAT, // mType, + mParam, refDetcWorkspace, workspace, + inputs[1], // inputs[InScore] + inputs[0], // inputs[InDelta], + mValidCnt->mPtr, // inputs[InCountValid], + inputs[2], // inputs[ROI] + detections); + + assert(status == cudaSuccess); + return status; +}; + +DataType DetectionLayer::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +{ + // 
Only DataType::kFLOAT is acceptable by the plugin layer + return DataType::kFLOAT; +} + +// Return true if output tensor is broadcast across a batch. +bool DetectionLayer::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const +{ + return false; +} + +// Return true if plugin can use input that is broadcast across batch without replication. +bool DetectionLayer::canBroadcastInputAcrossBatch(int inputIndex) const +{ + return false; +} + +// Configure the layer with input and output data types. +void DetectionLayer::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) +{ + check_valid_inputs(inputDims, nbInputs); + assert(inputDims[0].d[0] == inputDims[1].d[0] && inputDims[1].d[0] == inputDims[2].d[0]); + + mAnchorsCnt = inputDims[2].d[0]; + mType = inputTypes[0]; + mMaxBatchSize = maxBatchSize; +} + +// Attach the plugin object to an execution context and grant the plugin the access to some context resource. +void DetectionLayer::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) +{ +} + +// Detach the plugin object from its execution context. +void DetectionLayer::detachFromContext() {} diff --git a/plugin/detectionLayerPlugin/detectionLayerPlugin.h b/plugin/detectionLayerPlugin/detectionLayerPlugin.h new file mode 100644 index 00000000..e24ff361 --- /dev/null +++ b/plugin/detectionLayerPlugin/detectionLayerPlugin.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef TRT_DETECTION_LAYER_PLUGIN_H +#define TRT_DETECTION_LAYER_PLUGIN_H +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "maskRCNNKernels.h" + +namespace nvinfer1 +{ +namespace plugin +{ + +class DetectionLayer : public IPluginV2Ext +{ +public: + DetectionLayer(int num_classes, int keep_topk, float score_threshold, float iou_threshold); + + DetectionLayer(const void* data, size_t length); + + ~DetectionLayer() override = default; + + int getNbOutputs() const override; + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + void terminate() override; + + void destroy() override; + + size_t getWorkspaceSize(int maxBatchSize) const override; + + int enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + size_t getSerializationSize() const override; + + void serialize(void* buffer) const override; + + bool supportsFormat(DataType type, PluginFormat format) const override; + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + IPluginV2Ext* clone() const override; + + void setPluginNamespace(const char* libNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; + + void detachFromContext() override; + +private: + void check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims); + + int mBackgroundLabel; + int mNbClasses; + int mKeepTopK; + float mScoreThreshold; + float mIOUThreshold; + + int mMaxBatchSize; + int mAnchorsCnt; + std::shared_ptr> mValidCnt; // valid cnt = number of input rois for every image. 
+ nvinfer1::DataType mType; + RefineNMSParameters mParam; + + std::string mNameSpace; +}; + +class DetectionLayerPluginCreator : public BaseCreator +{ +public: + DetectionLayerPluginCreator(); + + ~DetectionLayerPluginCreator(){}; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override; + + IPluginV2Ext* deserializePlugin(const char* name, const void* data, size_t length) override; + +private: + static PluginFieldCollection mFC; + int mNbClasses; + int mKeepTopK; + float mScoreThreshold; + float mIOUThreshold; + static std::vector mPluginAttributes; +}; +} // namespace plugin +} // namespace nvinfer1 +#endif // TRT_DETECTION_LAYER_PLUGIN_H diff --git a/plugin/exports.map b/plugin/exports.map index 73e545e6..a65c632c 100644 --- a/plugin/exports.map +++ b/plugin/exports.map @@ -17,28 +17,7 @@ /* Hides all symbols except those specified in the global section */ { global: - createInferBuilder_INTERNAL; - createInferRuntime_INTERNAL; - createInferRefitter_INTERNAL; - getInferLibVersion; - createRPNROIPlugin; - createNormalizePlugin; - createPriorBoxPlugin; - createAnchorGeneratorPlugin; - createNMSPlugin; - createLReLUPlugin; - createReorgPlugin; - createRegionPlugin; - createClipPlugin; - createBatchedNMSPlugin; - getPluginRegistry; initLibNvInferPlugins; - extern "C++" { - nvinfer1::*; - nvcaffeparser1::*; - nvonnxparser::*; - nvuffparser::*; - }; local: *; }; diff --git a/plugin/instanceNormalizationPlugin/CMakeLists.txt b/plugin/instanceNormalizationPlugin/CMakeLists.txt new file mode 100644 index 00000000..de8fff17 --- /dev/null +++ b/plugin/instanceNormalizationPlugin/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +file(GLOB SRCS *.cpp) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) diff --git a/plugin/instanceNormalizationPlugin/README.md b/plugin/instanceNormalizationPlugin/README.md new file mode 100644 index 00000000..ef29da52 --- /dev/null +++ b/plugin/instanceNormalizationPlugin/README.md @@ -0,0 +1,57 @@ +# InstanceNormalizationPlugin + +**Table Of Contents** +- [Description](#description) + * [Structure](#structure) +- [Parameters](#parameters) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +The `InstanceNormalizePlugin` is used for the InstanceNormalization layer, which is generally used in deep learning models that perform image generation. This plugin is based off the [ONNX opset 6 definition](https://github.com/onnx/onnx/blob/master/docs/Operators.md#InstanceNormalization), and is used in any ONNX model that uses this operation. 
+ +Specifically, given an array of values `x = [x_0, x_1, ..., x_n]`, a scale factor, a bias factor, and an epsilon, the InstanceNormalization of `x` is `scale * (x-mean) / sqrt(variance + epsilon) + bias`, where the mean and variance are computed per instance per channel. + +### Structure + +This plugin takes one input and generates one output. The first input is the data from the last layer that is going to be normalized. It has a shape of `[N, C, H, W]`, where `N` is the batch size, `C` is the number of channels, `H` is the height, and `W` is the width. + +The dimensions of the output are exactly the same as those of the input. + +## Parameters + +This plugin consists of the plugin creator class `InstanceNormalizationPluginCreator` and the plugin class `InstanceNormalizationPlugin`. To create the plugin instance, the following parameters are used: + +| Type | Parameter | Description +|------------|--------------------------|-------------------------------------------------------- +|`float` |`epsilon` |A small number added to the variance to avoid division by zero during normalization. +|`Weights *` |`scale` |A pointer to weights containing the scale factors for normalization. The definition of `Weights` can be found in the [NvInfer.h](https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/_nv_infer_8h_source.html) header. +|`Weights *` |`bias` |A pointer to weights containing the bias values for normalization. The definition of `Weights` can be found in the [NvInfer.h](https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/_nv_infer_8h_source.html) header. + + +## Additional resources + +The following resources provide a deeper understanding of the `InstanceNormalizationPlugin` plugin: + +**Networks** +- [ONNX Operator Definition](https://github.com/onnx/onnx/blob/master/docs/Operators.md#InstanceNormalization) +- [Instance Normalization Paper](https://arxiv.org/abs/1607.08022) + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) +documentation. + + +## Changelog + +September 2019 +This is the first release of this `README.md` file. + + +## Known issues + +There are no known issues in this plugin. diff --git a/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp new file mode 100644 index 00000000..6bdcccc8 --- /dev/null +++ b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include +#include "instanceNormalizationPlugin.h" + +using namespace nvinfer1; +using nvinfer1::plugin::InstanceNormalizationPlugin; +using nvinfer1::plugin::InstanceNormalizationPluginCreator; + +#define CHECK_CUDA(call) \ + do \ + { \ + cudaError_t status = call; \ + if (status != cudaSuccess) \ + { \ + return status; \ + } \ + } while (0) + +#define CHECK_CUDNN(call) \ + do \ + { \ + cudnnStatus_t status = call; \ + if (status != CUDNN_STATUS_SUCCESS) \ + { \ + return status; \ + } \ + } while (0) + +inline bool is_CHW(nvinfer1::Dims const& dims) +{ + return (dims.nbDims == 3 && dims.type[0] == nvinfer1::DimensionType::kCHANNEL + && dims.type[1] == nvinfer1::DimensionType::kSPATIAL && dims.type[2] == nvinfer1::DimensionType::kSPATIAL); +} + +// This is derived from: https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ +inline float half_to_float_fast(unsigned short value) +{ + union F32 + { + unsigned int u; + float f; + }; + static const F32 magic = {(254 - 15) << 23}; + static const F32 was_infnan = {(127 + 16) << 23}; + F32 result; + result.u = (value & 0x7fff) << 13; // exponent/mantissa bits + result.f *= magic.f; // exponent adjust + if (result.f >= was_infnan.f) + { // make sure Inf/NaN survive + result.u |= 255 << 23; + } + result.u |= (value & 0x8000) << 16; // sign bit + return result.f; +} + +cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t* cudnn_dtype) +{ + switch (trt_dtype) + { + case nvinfer1::DataType::kFLOAT: *cudnn_dtype = CUDNN_DATA_FLOAT; break; + case nvinfer1::DataType::kHALF: *cudnn_dtype = CUDNN_DATA_HALF; break; + default: return CUDNN_STATUS_BAD_PARAM; + } + return CUDNN_STATUS_SUCCESS; +} + +namespace { + constexpr const char* INSTANCE_PLUGIN_VERSION{"001"}; + constexpr const char* INSTANCE_PLUGIN_NAME{"InstanceNormalization_TRT"}; +} + +PluginFieldCollection InstanceNormalizationPluginCreator::mFC{}; +std::vector InstanceNormalizationPluginCreator::mPluginAttributes; + +InstanceNormalizationPlugin::InstanceNormalizationPlugin( + float epsilon, nvinfer1::Weights const& scale, nvinfer1::Weights const& bias) + : _epsilon(epsilon) + , _nchan(scale.count) + , _initialized(false) + , _scale(scale) + , _bias(bias) +{ + ASSERT(scale.count == bias.count); + if (scale.type == nvinfer1::DataType::kFLOAT) + { + _h_scale.assign((float*) scale.values, (float*) scale.values + scale.count); + } + else if (scale.type == nvinfer1::DataType::kHALF) + { + _h_scale.reserve(_nchan); + for (int c = 0; c < _nchan; ++c) + { + unsigned short value = ((unsigned short*) scale.values)[c]; + _h_scale.push_back(half_to_float_fast(value)); + } + } + else + { + throw std::runtime_error("Unsupported scale dtype"); + } + if (bias.type == nvinfer1::DataType::kFLOAT) + { + _h_bias.assign((float*) bias.values, (float*) bias.values + bias.count); + } + else if (bias.type == nvinfer1::DataType::kHALF) + { + _h_bias.reserve(_nchan); + for (int c = 0; c < _nchan; ++c) + { + unsigned short value = ((unsigned short*) bias.values)[c]; + _h_bias.push_back(half_to_float_fast(value)); + } + } + else + { + throw std::runtime_error("Unsupported bias dtype"); + } +} + +InstanceNormalizationPlugin::InstanceNormalizationPlugin(void const* serialData, size_t serialLength) : _initialized(false) +{ + deserialize_value(&serialData, &serialLength, &_epsilon); + deserialize_value(&serialData, &serialLength, &_nchan); + deserialize_value(&serialData, &serialLength, &_h_scale); + deserialize_value(&serialData, &serialLength, &_h_bias); +} + 
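// Note: the constructor above deserializes _epsilon, _nchan, _h_scale and _h_bias in exactly
// the order in which serialize() and getSerializationSize() below write them; the two paths
// must stay in sync.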
+InstanceNormalizationPlugin::~InstanceNormalizationPlugin() +{ + terminate(); +} + +// InstanceNormalizationPlugin returns one output. +int InstanceNormalizationPlugin::getNbOutputs() const +{ + return 1; +} + +DimsExprs InstanceNormalizationPlugin::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) +{ + nvinfer1::DimsExprs output(inputs[0]); + return output; +} + +int InstanceNormalizationPlugin::initialize() +{ + _initialized = true; + return 0; +} + +void InstanceNormalizationPlugin::terminate() +{ + if (!_initialized) + { + return; + } + cudnnDestroyTensorDescriptor(_y_desc); + cudnnDestroyTensorDescriptor(_x_desc); + cudnnDestroyTensorDescriptor(_b_desc); + cudaFree(_d_bias); + cudaFree(_d_scale); + cudnnDestroy(_cudnn_handle); + _initialized = false; +} + +size_t InstanceNormalizationPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const +{ + return 0; +} + + +int InstanceNormalizationPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) +{ + nvinfer1::Dims input_dims = inputDesc[0].dims; + int n = input_dims.d[0]; + int c = input_dims.d[1]; + int h = input_dims.d[2]; + int w = input_dims.d[3]; + size_t nchan_bytes = c * sizeof(float); + + // Note: We repeat the data for each batch entry so that we can do the full + // computation in a single CUDNN call in enqueue(). + CHECK_CUDA(cudaMalloc((void**) &_d_scale, n * nchan_bytes)); + CHECK_CUDA(cudaMalloc((void**) &_d_bias, n * nchan_bytes)); + for (int i = 0; i < n; ++i) + { + CHECK_CUDA(cudaMemcpy(_d_scale + i * c, _h_scale.data(), nchan_bytes, cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(_d_bias + i * c, _h_bias.data(), nchan_bytes, cudaMemcpyHostToDevice)); + } + CHECK_CUDNN(cudnnCreate(&_cudnn_handle)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&_b_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&_x_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&_y_desc)); + + CHECK_CUDNN(cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1)); + cudnnDataType_t cudnn_dtype; + CHECK_CUDNN(convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype)); + CHECK_CUDNN(cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w)); + CHECK_CUDNN(cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w)); + float alpha = 1; + float beta = 0; + void const* x_ptr = inputs[0]; + void* y_ptr = outputs[0]; + CHECK_CUDNN(cudnnSetStream(_cudnn_handle, stream)); + // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical + // overflows (NaNs) for fp32 data in some circumstances. The lower- + // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not + // acceptable. 
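// Because the tensor descriptors above describe the data as (1, n*c, h, w), the spatial batch
// normalization below computes a separate mean/variance over h*w for each of the n*c "channels",
// i.e. per instance and per channel, which is exactly instance normalization; the tiled
// _d_scale/_d_bias arrays supply the per-(n, c) affine parameters.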
+ CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, &beta, + _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, _d_scale, _d_bias, 1., nullptr, nullptr, _epsilon, nullptr, nullptr)); + return 0; +} + +size_t InstanceNormalizationPlugin::getSerializationSize() const +{ + return (serialized_size(_epsilon) + + serialized_size(_nchan) + + serialized_size(_h_scale) + + serialized_size(_h_bias)); +} + +void InstanceNormalizationPlugin::serialize(void *buffer) const +{ + serialize_value(&buffer, _epsilon); + serialize_value(&buffer, _nchan); + serialize_value(&buffer, _h_scale); + serialize_value(&buffer, _h_bias); +} + +bool InstanceNormalizationPlugin::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) +{ + ASSERT(inOut && pos < (nbInputs + nbOutputs)); + return ((inOut[pos].type == nvinfer1::DataType::kFLOAT || inOut[pos].type == nvinfer1::DataType::kHALF) + && inOut[pos].format == nvinfer1::PluginFormat::kNCHW); +} + +const char* InstanceNormalizationPlugin::getPluginType() const +{ + return INSTANCE_PLUGIN_NAME; +} + +const char* InstanceNormalizationPlugin::getPluginVersion() const +{ + return INSTANCE_PLUGIN_VERSION; +} + +void InstanceNormalizationPlugin::destroy() +{ + delete this; +} + +IPluginV2DynamicExt* InstanceNormalizationPlugin::clone() const +{ + return new InstanceNormalizationPlugin{_epsilon, _scale, _bias}; +} + +// Set plugin namespace +void InstanceNormalizationPlugin::setPluginNamespace(const char* pluginNamespace) +{ + mPluginNamespace = pluginNamespace; +} + +const char* InstanceNormalizationPlugin::getPluginNamespace() const +{ + return mPluginNamespace; +} + +nvinfer1::DataType InstanceNormalizationPlugin::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +{ + ASSERT(inputTypes && nbInputs > 0 && index == 0); + return inputTypes[0]; +} + +// Attach the plugin object to an execution context and grant the plugin the access to some context resource. +void InstanceNormalizationPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) +{ +} + +// Detach the plugin object from its execution context. 
+void InstanceNormalizationPlugin::detachFromContext() {} + +void InstanceNormalizationPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) +{ + for (int i = 0; i < nbInputs; i++) + { + for (int j = 0; j < in[0].desc.dims.nbDims; j++) + { + // Do not support dynamic dimensions + ASSERT(in[0].desc.dims.d[j] != -1); + } + } +} + +// InstanceNormalizationPluginCreator methods +InstanceNormalizationPluginCreator::InstanceNormalizationPluginCreator() +{ + mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back(PluginField("scales", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* InstanceNormalizationPluginCreator::getPluginName() const +{ + return INSTANCE_PLUGIN_NAME; +} + +const char* InstanceNormalizationPluginCreator::getPluginVersion() const +{ + return INSTANCE_PLUGIN_VERSION; +} + +const PluginFieldCollection* InstanceNormalizationPluginCreator::getFieldNames() +{ + return &mFC; +} + +IPluginV2DynamicExt* InstanceNormalizationPluginCreator::createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) +{ + std::vector scaleValues; + std::vector biasValues; + float epsilon {}; + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "epsilon")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + epsilon= *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "scales")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + int size = fields[i].length; + scaleValues.reserve(size); + const auto* w = static_cast(fields[i].data); + for (int j = 0; j < size; j++) + { + scaleValues.push_back(*w); + w++; + } + } + else if (!strcmp(attrName, "bias")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + int size = fields[i].length; + biasValues.reserve(size); + const auto* w = static_cast(fields[i].data); + for (int j = 0; j < size; j++) + { + biasValues.push_back(*w); + w++; + } + } + } + + Weights scaleWeights{DataType::kFLOAT, scaleValues.data(), (int64_t) scaleValues.size()}; + Weights biasWeights{DataType::kFLOAT, biasValues.data(), (int64_t) biasValues.size()}; + + InstanceNormalizationPlugin* obj = new InstanceNormalizationPlugin(epsilon, scaleWeights, biasWeights); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; +} + +// TO TEST: +IPluginV2DynamicExt* InstanceNormalizationPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) +{ + InstanceNormalizationPlugin* obj = new InstanceNormalizationPlugin{serialData, serialLength}; + obj->setPluginNamespace(mNamespace.c_str()); + return obj; +} diff --git a/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h new file mode 100644 index 00000000..bc077e35 --- /dev/null +++ b/plugin/instanceNormalizationPlugin/instanceNormalizationPlugin.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TRT_INSTANCE_NORMALIZATION_PLUGIN_H
+#define TRT_INSTANCE_NORMALIZATION_PLUGIN_H
+#include "serialize.hpp"
+#include "plugin.h"
+#include <cuda_fp16.h>
+#include <cudnn.h>
+#include <string>
+#include <vector>
+
+typedef unsigned short half_type;
+
+namespace nvinfer1
+{
+namespace plugin
+{
+class InstanceNormalizationPlugin final : public nvinfer1::IPluginV2DynamicExt
+{
+
+public:
+    InstanceNormalizationPlugin(float epsilon, nvinfer1::Weights const& scale, nvinfer1::Weights const& bias);
+    InstanceNormalizationPlugin(void const* serialData, size_t serialLength);
+
+    InstanceNormalizationPlugin() = delete;
+
+    ~InstanceNormalizationPlugin() override;
+
+    int getNbOutputs() const override;
+
+    // DynamicExt plugins return DimsExprs instead of Dims.
+    DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) override;
+
+    int initialize() override;
+
+    void terminate() override;
+
+    size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const override;
+
+    int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc,
+        const void* const* inputs, void* const* outputs,
+        void* workspace,
+        cudaStream_t stream) override;
+
+    size_t getSerializationSize() const override;
+
+    void serialize(void* buffer) const override;
+
+    // DynamicExt plugin supportsFormat update.
+ bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override; + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + void destroy() override; + + nvinfer1::IPluginV2DynamicExt* clone() const override; + + void setPluginNamespace(const char* pluginNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + void attachToContext(cudnnContext* cudnn, cublasContext* cublas, nvinfer1::IGpuAllocator* allocator) override; + + void detachFromContext() override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) override; +private: + float _epsilon; + int _nchan; + std::vector _h_scale; + std::vector _h_bias; + float* _d_scale; + float* _d_bias; + bool _initialized; + nvinfer1::Weights _scale, _bias; + cudnnHandle_t _cudnn_handle; + cudnnTensorDescriptor_t _x_desc, _y_desc, _b_desc; + const char* mPluginNamespace; + std::string mNamespace; +}; + +class InstanceNormalizationPluginCreator : public BaseCreator +{ +public: + InstanceNormalizationPluginCreator(); + + ~InstanceNormalizationPluginCreator() override = default; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2DynamicExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) override; + + IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + +private: + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; + std::string mNamespace; +}; +} //namespace plugin +} //namespace nvinfer1 + +#endif // TRT_INSTANCE_NORMALIZATION_PLUGIN_H diff --git a/plugin/normalizePlugin/normalizePlugin.cpp b/plugin/normalizePlugin/normalizePlugin.cpp index a3932c96..bbc34fcc 100644 --- a/plugin/normalizePlugin/normalizePlugin.cpp +++ b/plugin/normalizePlugin/normalizePlugin.cpp @@ -244,7 +244,7 @@ void Normalize::destroy() IPluginV2Ext* Normalize::clone() const { // Create a new instance - IPluginV2Ext* plugin = new Normalize(&mWeights, mNbWeights, acrossSpatial, channelShared, eps); + IPluginV2Ext* plugin = new Normalize(&mWeights, mNbWeights, acrossSpatial, channelShared, eps, C, H, W); // Set the namespace plugin->setPluginNamespace(mPluginNamespace); diff --git a/plugin/normalizePlugin/normalizePlugin.h b/plugin/normalizePlugin/normalizePlugin.h index 7391c16f..b4c39da5 100644 --- a/plugin/normalizePlugin/normalizePlugin.h +++ b/plugin/normalizePlugin/normalizePlugin.h @@ -92,11 +92,14 @@ class Normalize : public IPluginV2Ext cublasHandle_t mCublas{}; - int C, H, W, mNbWeights; - bool acrossSpatial; - bool channelShared; - float eps; - Weights mWeights; + int C{}; + int H{}; + int W{}; + int mNbWeights{}; + bool acrossSpatial{}; + bool channelShared{}; + float eps{}; + Weights mWeights{}; const char* mPluginNamespace; }; @@ -119,9 +122,10 @@ class NormalizePluginCreator : public BaseCreator private: static PluginFieldCollection mFC; - bool mAcrossSpatial, mChannelShared; - float mEps; - int mNbWeights; + bool mAcrossSpatial{}; + bool mChannelShared{}; + float mEps{}; + int mNbWeights{}; static std::vector mPluginAttributes; }; } // namespace plugin diff --git a/plugin/proposalLayerPlugin/CMakeLists.txt 
b/plugin/proposalLayerPlugin/CMakeLists.txt
new file mode 100644
index 00000000..1dbe788c
--- /dev/null
+++ b/plugin/proposalLayerPlugin/CMakeLists.txt
@@ -0,0 +1,18 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+file(GLOB SRCS *.cpp *.cu)
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
diff --git a/plugin/proposalLayerPlugin/README.md b/plugin/proposalLayerPlugin/README.md
new file mode 100644
index 00000000..d76d1896
--- /dev/null
+++ b/plugin/proposalLayerPlugin/README.md
@@ -0,0 +1,76 @@
+# ProposalLayer
+
+**Table Of Contents**
+- [Description](#description)
+  * [Structure](#structure)
+- [Parameters](#parameters)
+- [Additional resources](#additional-resources)
+- [License](#license)
+- [Changelog](#changelog)
+- [Known issues](#known-issues)
+
+## Description
+
+The `ProposalLayer` plugin generates the first-stage detections (ROI candidates) from the objectness scores and box refinements produced by the RPN (Region Proposal Network), together with pre-defined anchors. It is used in sampleMaskRCNN.
+
+
+### Structure
+
+This plugin supports the NCHW format. It takes two input tensors: `object_score` and `object_delta`.
+
+`object_score` is the objectness score from the RPN. Its shape is `[N, anchors, 2, 1]`, where `N` is the batch size, `anchors` is the total number of anchors and `2` is the number of objectness classes --- foreground and background.
+
+`object_delta` is the refinement info from the RPN, of shape `[N, anchors, 4, 1]`. `4` is the number of refinement elements --- `[dy, dx, dh, dw]`.
+
+This plugin generates one output tensor of shape `[N, keep_topk, 4]`, where `keep_topk` is the maximum number of detections kept after NMS and `4` is the number of ROI candidate coordinates --- `[y1, x1, y2, x2]`.
+
+Unlike the Keras implementation, where the anchors are fed as an input, the default anchors are generated inside this plugin during `initialize()`.
+For resnet101 with a 1024*1024 input shape, the number of anchors can be computed as
+```
+Anchors in feature map P2: 256*256*3
+Anchors in feature map P3: 128*128*3
+Anchors in feature map P4: 64*64*3
+Anchors in feature map P5: 32*32*3
+Anchors in feature map P6 (max pooling): 16*16*3
+
+total number of anchors: 87296*3 = 261888
+```
+
+## Parameters
+
+This plugin has the plugin creator class `ProposalLayerPluginCreator` and the plugin class `ProposalLayer`.
+
+The following parameters are used to create a `ProposalLayer` instance:
+
+| Type               | Parameter                        | Description
+|--------------------|----------------------------------|--------------------------------------------------------
+|`int`               |`prenms_topk`                     |The number of ROIs kept before NMS.
+|`int`               |`keep_topk`                       |The number of detections kept after NMS.
+|`float`             |`iou_threshold`                   |IOU threshold value used in NMS.
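+
+As an illustrative sketch (not part of the sample code), a `ProposalLayer` instance can be created through the TensorRT plugin registry once the plugin library has been initialized (for example via `initLibNvInferPlugins`). The function name and attribute values below are placeholders:
+
+```
+#include "NvInferPlugin.h"
+#include <vector>
+
+nvinfer1::IPluginV2* createProposalLayer()
+{
+    // Creator registered for this plugin ("ProposalLayer_TRT", version "1").
+    auto* creator = getPluginRegistry()->getPluginCreator("ProposalLayer_TRT", "1");
+
+    // Placeholder values: keep 1024 ROIs before NMS, 1000 after, with a 0.7 IoU threshold.
+    int prenmsTopK = 1024;
+    int keepTopK = 1000;
+    float iouThreshold = 0.7f;
+
+    std::vector<nvinfer1::PluginField> fields{
+        {"prenms_topk", &prenmsTopK, nvinfer1::PluginFieldType::kINT32, 1},
+        {"keep_topk", &keepTopK, nvinfer1::PluginFieldType::kINT32, 1},
+        {"iou_threshold", &iouThreshold, nvinfer1::PluginFieldType::kFLOAT32, 1}};
+
+    nvinfer1::PluginFieldCollection fc;
+    fc.nbFields = static_cast<int>(fields.size());
+    fc.fields = fields.data();
+
+    return creator->createPlugin("proposal_layer", &fc);
+}
+```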
+ + +## Additional resources + +The following resources provide a deeper understanding of the `ProposalLayer` plugin: + +- [MaskRCNN](https://github.com/matterport/Mask_RCNN) + + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) +documentation. + + +## Changelog + +June 2019 +This is the first release of this `README.md` file. + + +## Known issues + +There are no known issues in this plugin. diff --git a/plugin/proposalLayerPlugin/mrcnn_config.h b/plugin/proposalLayerPlugin/mrcnn_config.h new file mode 100644 index 00000000..9cdfae16 --- /dev/null +++ b/plugin/proposalLayerPlugin/mrcnn_config.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MASKRCNN_CONFIG_HEADER +#define MASKRCNN_CONFIG_HEADER +#include "NvInfer.h" +#include +#include +using namespace nvinfer1; + +namespace MaskRCNNConfig +{ +static const nvinfer1::DimsCHW IMAGE_SHAPE{3, 1024, 1024}; + +// Pooled ROIs +static const int POOL_SIZE = 7; +static const int MASK_POOL_SIZE = 14; + +// Threshold to determine the mask area out of final convolution output +static const float MASK_THRESHOLD = 0.5; + +// Bounding box refinement standard deviation for RPN and final detections. +static const float RPN_BBOX_STD_DEV[] = {0.1, 0.1, 0.2, 0.2}; +static const float BBOX_STD_DEV[] = {0.1, 0.1, 0.2, 0.2}; + +// Max number of final detections +static const int DETECTION_MAX_INSTANCES = 100; + +// Minimum probability value to accept a detected instance +// ROIs below this threshold are skipped +static const float DETECTION_MIN_CONFIDENCE = 0.7; + +// Non-maximum suppression threshold for detection +static const float DETECTION_NMS_THRESHOLD = 0.3; + +// The strides of each layer of the FPN Pyramid. These values +// are based on a Resnet101 backbone. +static const std::vector BACKBONE_STRIDES = {4, 8, 16, 32, 64}; + +// Size of the fully-connected layers in the classification graph +static const int FPN_CLASSIF_FC_LAYERS_SIZE = 1024; + +// Size of the top-down layers used to build the feature pyramid +static const int TOP_DOWN_PYRAMID_SIZE = 256; + +// Number of classification classes (including background) +static const int NUM_CLASSES = 1 + 80; // COCO has 80 classes + +// Length of square anchor side in pixels +static const std::vector RPN_ANCHOR_SCALES = {32, 64, 128, 256, 512}; + +// Ratios of anchors at each cell (width/height) +// A value of 1 represents a square anchor, and 0.5 is a wide anchor +static const float RPN_ANCHOR_RATIOS[] = {0.5, 1, 2}; + +// Anchor stride +// If 1 then anchors are created for each cell in the backbone feature map. +// If 2, then anchors are created for every other cell, and so on. 
+static const int RPN_ANCHOR_STRIDE = 1; + +// Although Python impementation uses 6000, +// TRT fails if this number larger than MAX_TOPK_K defined in engine/checkMacros.h +static const int MAX_PRE_NMS_RESULTS = 1024; // 3840; + +// Non-max suppression threshold to filter RPN proposals. +// You can increase this during training to generate more propsals. +static const float RPN_NMS_THRESHOLD = 0.7; + +// ROIs kept after non-maximum suppression (training and inference) +static const int POST_NMS_ROIS_INFERENCE = 1000; + +// COCO Class names +static const std::vector CLASS_NAMES = { + "BG", + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +}; + +static const std::string MODEL_NAME = "mrcnn_nchw.uff"; +//"input_anchors" is not working as input for .uff model. +// The anchors are generated in proposallayer plugin.for +// resnet101 + 1024*1024 input shape: (FP2: 256*256 + FP3: 128*128 + FP4: +// 64*64 + FP5: 32*32 + FP6(maxpooling) 16*16 = 87296) +static const std::string MODEL_INPUT = "input_image"; +static const DimsCHW MODEL_INPUT_SHAPE = IMAGE_SHAPE; +static const std::vector MODEL_OUTPUTS = {"mrcnn_detection", "mrcnn_mask/Sigmoid"}; +static const Dims2 MODEL_DETECTION_SHAPE{DETECTION_MAX_INSTANCES, 6}; +static const Dims4 MODEL_MASK_SHAPE{DETECTION_MAX_INSTANCES, NUM_CLASSES, 28, 28}; +} // namespace MaskRCNNConfig +#endif diff --git a/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp b/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp new file mode 100644 index 00000000..a5a32bb7 --- /dev/null +++ b/plugin/proposalLayerPlugin/proposalLayerPlugin.cpp @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "proposalLayerPlugin.h" +#include "mrcnn_config.h" +#include "plugin.h" +#include +#include +#include + +using namespace nvinfer1; +using namespace plugin; +using nvinfer1::plugin::ProposalLayer; +using nvinfer1::plugin::ProposalLayerPluginCreator; + +namespace +{ +const char* PROPOSALLAYER_PLUGIN_VERSION{"1"}; +const char* PROPOSALLAYER_PLUGIN_NAME{"ProposalLayer_TRT"}; +} // namespace + +PluginFieldCollection ProposalLayerPluginCreator::mFC{}; +std::vector ProposalLayerPluginCreator::mPluginAttributes; + +ProposalLayerPluginCreator::ProposalLayerPluginCreator() +{ + + mPluginAttributes.emplace_back(PluginField("prenms_topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* ProposalLayerPluginCreator::getPluginName() const +{ + return PROPOSALLAYER_PLUGIN_NAME; +}; + +const char* ProposalLayerPluginCreator::getPluginVersion() const +{ + return PROPOSALLAYER_PLUGIN_VERSION; +}; + +const PluginFieldCollection* ProposalLayerPluginCreator::getFieldNames() +{ + return &mFC; +}; + +IPluginV2Ext* ProposalLayerPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +{ + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "prenms_topk")) + { + assert(fields[i].type == PluginFieldType::kINT32); + mPreNMSTopK = *(static_cast(fields[i].data)); + } + if (!strcmp(attrName, "keep_topk")) + { + assert(fields[i].type == PluginFieldType::kINT32); + mKeepTopK = *(static_cast(fields[i].data)); + } + if (!strcmp(attrName, "iou_threshold")) + { + assert(fields[i].type == PluginFieldType::kFLOAT32); + mIOUThreshold = *(static_cast(fields[i].data)); + } + } + return new ProposalLayer(mPreNMSTopK, mKeepTopK, mIOUThreshold); +}; + +IPluginV2Ext* ProposalLayerPluginCreator::deserializePlugin(const char* name, const void* data, size_t length) +{ + return new ProposalLayer(data, length); +}; + +ProposalLayer::ProposalLayer(int prenms_topk, int keep_topk, float iou_threshold) + : mPreNMSTopK(prenms_topk) + , mKeepTopK(keep_topk) + , mIOUThreshold(iou_threshold) +{ + mBackgroundLabel = -1; + assert(mPreNMSTopK > 0); + assert(mKeepTopK > 0); + assert(iou_threshold > 0.0f); + + mParam.backgroundLabelId = -1; + mParam.numClasses = 1; + mParam.keepTopK = mKeepTopK; + mParam.scoreThreshold = 0.0; + mParam.iouThreshold = mIOUThreshold; + + mType = DataType::kFLOAT; + + generate_pyramid_anchors(); +}; + +int ProposalLayer::getNbOutputs() const +{ + return 1; +}; + +int ProposalLayer::initialize() +{ + // Init the mValidCnt of max batch size + std::vector tempValidCnt(mMaxBatchSize, mPreNMSTopK); + + mValidCnt = std::make_shared>(mMaxBatchSize); + + CUASSERT(cudaMemcpy( + mValidCnt->mPtr, static_cast(tempValidCnt.data()), sizeof(int) * mMaxBatchSize, cudaMemcpyHostToDevice)); + + // Init the anchors for batch size: + mAnchorBoxesDevice = std::make_shared>(mAnchorsCnt * 4 * mMaxBatchSize); + int batch_offset = sizeof(float) * mAnchorsCnt * 4; + uint8_t* device_ptr = static_cast(mAnchorBoxesDevice->mPtr); + for (int i = 0; i < mMaxBatchSize; i++) + { + CUASSERT(cudaMemcpy(static_cast(device_ptr + i * batch_offset), + static_cast(mAnchorBoxesHost.data()), batch_offset, cudaMemcpyHostToDevice)); + } + + 
return 0; +}; + +void ProposalLayer::terminate(){}; + +void ProposalLayer::destroy() +{ + delete this; +}; + +bool ProposalLayer::supportsFormat(DataType type, PluginFormat format) const +{ + return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); +}; + +const char* ProposalLayer::getPluginType() const +{ + return "ProposalLayer_TRT"; +}; + +const char* ProposalLayer::getPluginVersion() const +{ + return "1"; +}; + +IPluginV2Ext* ProposalLayer::clone() const +{ + return new ProposalLayer(*this); +}; + +void ProposalLayer::setPluginNamespace(const char* libNamespace) +{ + mNameSpace = libNamespace; +}; + +const char* ProposalLayer::getPluginNamespace() const +{ + return mNameSpace.c_str(); +}; + +size_t ProposalLayer::getSerializationSize() const +{ + return sizeof(int) * 2 + sizeof(float) + sizeof(int) * 2; +}; + +void ProposalLayer::serialize(void* buffer) const +{ + char *d = reinterpret_cast(buffer), *a = d; + write(d, mPreNMSTopK); + write(d, mKeepTopK); + write(d, mIOUThreshold); + write(d, mMaxBatchSize); + write(d, mAnchorsCnt); + ASSERT(d == a + getSerializationSize()); +}; + +ProposalLayer::ProposalLayer(const void* data, size_t length) +{ + const char *d = reinterpret_cast(data), *a = d; + int prenms_topk = read(d); + int keep_topk = read(d); + float iou_threshold = read(d); + mMaxBatchSize = read(d); + mAnchorsCnt = read(d); + ASSERT(d == a + length); + + mBackgroundLabel = -1; + mPreNMSTopK = prenms_topk; + mKeepTopK = keep_topk; + mScoreThreshold = 0.0; + mIOUThreshold = iou_threshold; + + mParam.backgroundLabelId = -1; + mParam.numClasses = 1; + mParam.keepTopK = mKeepTopK; + mParam.scoreThreshold = 0.0; + mParam.iouThreshold = mIOUThreshold; + + mType = DataType::kFLOAT; + + generate_pyramid_anchors(); +}; + +void ProposalLayer::check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims) +{ + // object_score[N, anchors, 2, 1], + // foreground_delta[N, anchors, 4, 1], + // anchors should be generated inside + assert(nbInputDims == 2); + // foreground_score + assert(inputs[0].nbDims == 3 && inputs[0].d[1] == 2); + // foreground_delta + assert(inputs[1].nbDims == 3 && inputs[1].d[1] == 4); +}; + +size_t ProposalLayer::getWorkspaceSize(int batch_size) const +{ + + ProposalWorkSpace proposal(batch_size, mAnchorsCnt, mPreNMSTopK, mParam, mType); + return proposal.totalSize; +}; + +Dims ProposalLayer::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) +{ + + check_valid_inputs(inputs, nbInputDims); + assert(index == 0); + + // [N, anchors, (y1, x1, y2, x2)] + nvinfer1::Dims proposals; + + proposals.nbDims = 2; + // number of keeping anchors + proposals.d[0] = mKeepTopK; + proposals.d[1] = 4; + + return proposals; +} + +void ProposalLayer::generate_pyramid_anchors() +{ + const auto image_dims = MaskRCNNConfig::IMAGE_SHAPE; + + const auto& scales = MaskRCNNConfig::RPN_ANCHOR_SCALES; + const auto& ratios = MaskRCNNConfig::RPN_ANCHOR_RATIOS; + const auto& strides = MaskRCNNConfig::BACKBONE_STRIDES; + auto anchor_stride = MaskRCNNConfig::RPN_ANCHOR_STRIDE; + + const float cy = image_dims.d[1] - 1; + const float cx = image_dims.d[2] - 1; + + auto& anchors = mAnchorBoxesHost; + assert(anchors.size() == 0); + + assert(scales.size() == strides.size()); + for (size_t s = 0; s < scales.size(); ++s) + { + float scale = scales[s]; + int stride = strides[s]; + + for (int y = 0; y < image_dims.d[1]; y += anchor_stride * stride) + for (int x = 0; x < image_dims.d[2]; x += anchor_stride * stride) + for (float r : ratios) + { + float sqrt_r = sqrt(r); + float h = 
scale / sqrt_r; + float w = scale * sqrt_r; + + anchors.insert(anchors.end(), + {(y - h / 2) / cy, (x - w / 2) / cx, (y + h / 2 - 1) / cy, (x + w / 2 - 1) / cx}); + } + } + + assert(anchors.size() % 4 == 0); +} + +int ProposalLayer::enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +{ + + void* proposals = outputs[0]; + + // proposal + ProposalWorkSpace proposalWorkspace(batch_size, mAnchorsCnt, mPreNMSTopK, mParam, mType); + cudaError_t status = proposalRefineBatchClassNMS(stream, batch_size, mAnchorsCnt, mPreNMSTopK, + DataType::kFLOAT, // mType, + mParam, proposalWorkspace, workspace, + inputs[0], // inputs[object_score] + inputs[1], // inputs[bbox_delta], + mValidCnt->mPtr, + mAnchorBoxesDevice->mPtr, // inputs[anchors] + proposals); + + assert(status == cudaSuccess); + return status; +}; + +// Return the DataType of the plugin output at the requested index +DataType ProposalLayer::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +{ + // Only DataType::kFLOAT is acceptable by the plugin layer + return DataType::kFLOAT; +} + +// Return true if output tensor is broadcast across a batch. +bool ProposalLayer::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const +{ + return false; +} + +// Return true if plugin can use input that is broadcast across batch without replication. +bool ProposalLayer::canBroadcastInputAcrossBatch(int inputIndex) const +{ + return false; +} + +// Configure the layer with input and output data types. +void ProposalLayer::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) +{ + check_valid_inputs(inputDims, nbInputs); + assert(inputDims[0].d[0] == inputDims[1].d[0]); + + mAnchorsCnt = inputDims[0].d[0]; + assert(mAnchorsCnt == (int) (mAnchorBoxesHost.size() / 4)); + mMaxBatchSize = maxBatchSize; +} + +// Attach the plugin object to an execution context and grant the plugin the access to some context resource. +void ProposalLayer::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) +{ +} + +// Detach the plugin object from its execution context. +void ProposalLayer::detachFromContext() {} diff --git a/plugin/proposalLayerPlugin/proposalLayerPlugin.h b/plugin/proposalLayerPlugin/proposalLayerPlugin.h new file mode 100644 index 00000000..1ddf9b65 --- /dev/null +++ b/plugin/proposalLayerPlugin/proposalLayerPlugin.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef TRT_PROPOSAL_LAYER_PLUGIN_H +#define TRT_PROPOSAL_LAYER_PLUGIN_H +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "maskRCNNKernels.h" + +namespace nvinfer1 +{ +namespace plugin +{ + +class ProposalLayer : public IPluginV2Ext +{ +public: + ProposalLayer(int prenms_topk, int keep_topk, float iou_threshold); + + ProposalLayer(const void* data, size_t length); + + ~ProposalLayer() override = default; + + int getNbOutputs() const override; + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + void terminate() override; + + void destroy() override; + + size_t getWorkspaceSize(int maxBatchSize) const override; + + int enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + size_t getSerializationSize() const override; + + void serialize(void* buffer) const override; + + // void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, + // nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override; + + bool supportsFormat(DataType type, PluginFormat format) const override; + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + IPluginV2Ext* clone() const override; + + void setPluginNamespace(const char* libNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; + + void detachFromContext() override; + +private: + void check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims); + void generate_pyramid_anchors(); + + int mBackgroundLabel; + int mPreNMSTopK; + int mKeepTopK; + float mScoreThreshold; + float mIOUThreshold; + + int mMaxBatchSize; + int mAnchorsCnt; + std::shared_ptr> mValidCnt; // valid cnt = number of input roi for every image. 
+ std::shared_ptr> + mAnchorBoxesDevice; // [N, anchors(261888 for resnet101 + 1024*1024), (y1, x1, y2, x2)] + std::vector mAnchorBoxesHost; + + nvinfer1::DataType mType; + RefineNMSParameters mParam; + + std::string mNameSpace; +}; + +class ProposalLayerPluginCreator : public BaseCreator +{ +public: + ProposalLayerPluginCreator(); + + ~ProposalLayerPluginCreator(){}; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override; + + IPluginV2Ext* deserializePlugin(const char* name, const void* data, size_t length) override; + +private: + static PluginFieldCollection mFC; + int mPreNMSTopK; + int mKeepTopK; + float mScoreThreshold; + float mIOUThreshold; + static std::vector mPluginAttributes; +}; +} // namespace plugin +} // namespace nvinfer1 +#endif // TRT_PROPOSAL_LAYER_PLUGIN_H diff --git a/plugin/proposalPlugin/README.md b/plugin/proposalPlugin/README.md index 48124fa7..4cacd6b6 100644 --- a/plugin/proposalPlugin/README.md +++ b/plugin/proposalPlugin/README.md @@ -11,7 +11,7 @@ ## Description -The `proposalPlugin` performs object detection for the Faster R-CNN model. +The `proposalPlugin` performs object detection for the Faster R-CNN model. This plugin is included in TensorRT and used in [sampleUffFasterRCNN] to perform inference. `proposalPlugin` decodes predicted bounding boxes, extracts their corresponding objectness score, extracts region of interest from predicted bounding boxes using non maximum suppression, for downstreaming ROIPooling tasks. @@ -82,4 +82,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. +There are no known issues in this plugin. \ No newline at end of file diff --git a/plugin/proposalPlugin/proposalPlugin.cpp b/plugin/proposalPlugin/proposalPlugin.cpp index d307d233..c552abcc 100644 --- a/plugin/proposalPlugin/proposalPlugin.cpp +++ b/plugin/proposalPlugin/proposalPlugin.cpp @@ -32,7 +32,7 @@ namespace static const char* PROPOSAL_PLUGIN_VERSION{"1"}; static const char* PROPOSAL_PLUGIN_NAME{"Proposal"}; static const float RPN_STD_SCALING{1.0f}; -} +} // namespace // Static class fields initialization PluginFieldCollection ProposalPluginCreator::mFC{}; @@ -145,9 +145,7 @@ ProposalPlugin::ProposalPlugin(const std::string name, const void* serial_buf, s ASSERT(a == d + serial_size); } -ProposalPlugin::~ProposalPlugin() -{ -} +ProposalPlugin::~ProposalPlugin() {} const char* ProposalPlugin::getPluginType() const { @@ -247,9 +245,7 @@ bool ProposalPlugin::supportsFormat(DataType type, PluginFormat format) const } } -void ProposalPlugin::terminate() -{ -} +void ProposalPlugin::terminate() {} void ProposalPlugin::destroy() { @@ -316,9 +312,7 @@ void ProposalPlugin::attachToContext( } // Detach the plugin object from its execution context. 
-void ProposalPlugin::detachFromContext()
-{
-}
+void ProposalPlugin::detachFromContext() {}
 
 ProposalPluginCreator::ProposalPluginCreator()
 {
@@ -335,9 +329,7 @@ ProposalPluginCreator::ProposalPluginCreator()
     mFC.fields = mPluginAttributes.data();
 }
 
-ProposalPluginCreator::~ProposalPluginCreator()
-{
-}
+ProposalPluginCreator::~ProposalPluginCreator() {}
 
 const char* ProposalPluginCreator::getPluginName() const
 {
@@ -429,7 +421,7 @@ IPluginV2Ext* ProposalPluginCreator::createPlugin(const char* name, const Plugin
         }
     }
 
-    ASSERT(input_height > 0 && input_width > 0 && rpn_stride > 0 && pre_nms_top_n > 0 && post_nms_top_n
+    ASSERT(input_height > 0 && input_width > 0 && rpn_stride > 0 && pre_nms_top_n > 0 && post_nms_top_n
         && roi_min_size >= 0.0f && nms_iou_threshold > 0.0f);
 
     IPluginV2Ext* plugin = new ProposalPlugin(name, input_height, input_width, RPN_STD_SCALING, rpn_stride,
diff --git a/plugin/pyramidROIAlignPlugin/CMakeLists.txt b/plugin/pyramidROIAlignPlugin/CMakeLists.txt
new file mode 100644
index 00000000..1dbe788c
--- /dev/null
+++ b/plugin/pyramidROIAlignPlugin/CMakeLists.txt
@@ -0,0 +1,18 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+file(GLOB SRCS *.cpp *.cu)
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
diff --git a/plugin/pyramidROIAlignPlugin/README.md b/plugin/pyramidROIAlignPlugin/README.md
new file mode 100644
index 00000000..b1287765
--- /dev/null
+++ b/plugin/pyramidROIAlignPlugin/README.md
@@ -0,0 +1,62 @@
+# PyramidROIAlign
+
+**Table Of Contents**
+- [Description](#description)
+  * [Structure](#structure)
+- [Parameters](#parameters)
+- [Additional resources](#additional-resources)
+- [License](#license)
+- [Changelog](#changelog)
+- [Known issues](#known-issues)
+
+## Description
+
+The `PyramidROIAlign` plugin performs the ROIAlign operation on the output feature maps from the FPN (Feature Pyramid Network). It is used in sampleMaskRCNN.
+
+
+### Structure
+
+This plugin supports the NCHW format. It takes multiple inputs: `roi` and the `feature_maps` from the FPN.
+
+`roi` contains the ROI candidates from `ProposalLayer`. Its shape is `[N, rois, 4]`, where `N` is the batch size, `rois` is the number of ROI candidates and `4` is the number of coordinates.
+
+`feature_maps` are the outputs of the FPN. In sampleMaskRCNN, the model we provide contains 4 feature maps from the FPN's different stages.
+
+This plugin generates one output tensor of shape `[N, rois, C, pooled_size, pooled_size]`, where `C` is the channel count of the FPN feature maps and `pooled_size` is the height (and width) of the pooled feature area after ROIAlign.
+
+## Parameters
+
+This plugin has the plugin creator class `PyramidROIAlignPluginCreator` and the plugin class `PyramidROIAlign`.
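+
+Internally, each ROI is pooled from a single FPN level that is chosen from the ROI's size, following the level-assignment heuristic of the FPN paper. The sketch below is illustrative only (the exact thresholds used by the CUDA kernel may differ) and assumes normalized `[y1, x1, y2, x2]` box coordinates:
+
+```
+#include <algorithm>
+#include <cmath>
+
+// Map one ROI to a pyramid level P2..P5 using k = floor(4 + log2(sqrt(h * w) / 224)),
+// where h and w are the ROI height and width in input-image pixels.
+int selectPyramidLevel(float y1, float x1, float y2, float x2, int inputSize)
+{
+    const float h = (y2 - y1) * inputSize;
+    const float w = (x2 - x1) * inputSize;
+    const int k = static_cast<int>(std::floor(4.0f + std::log2(std::sqrt(h * w) / 224.0f)));
+    return std::max(2, std::min(5, k)); // clamp to the four feature maps P2..P5
+}
+```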
+ +The following parameters were used to create `PyramidROIAlign` instance: + +| Type | Parameter | Description +|------------------|---------------------------------|-------------------------------------------------------- +|`int` |`pooled_size` | The spatial size of a feature area after ROIAlgin will be `[pooled_size, pooled_size]` + + +## Additional resources + +The following resources provide a deeper understanding of the `PyramidROIAlign` plugin: + +- [MaskRCNN](https://github.com/matterport/Mask_RCNN) +- [FPN](https://arxiv.org/abs/1612.03144) + + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) +documentation. + + +## Changelog + +June 2019 +This is the first release of this `README.md` file. + + +## Known issues + +There are no known issues in this plugin. diff --git a/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp new file mode 100644 index 00000000..accdad50 --- /dev/null +++ b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.cpp @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "pyramidROIAlignPlugin.h" +#include "plugin.h" +#include + +using namespace nvinfer1; +using namespace plugin; +using nvinfer1::plugin::PyramidROIAlign; +using nvinfer1::plugin::PyramidROIAlignPluginCreator; + +namespace +{ +const char* PYRAMIDROIALGIN_PLUGIN_VERSION{"1"}; +const char* PYRAMIDROIALGIN_PLUGIN_NAME{"PyramidROIAlign_TRT"}; +} // namespace + +PluginFieldCollection PyramidROIAlignPluginCreator::mFC{}; +std::vector PyramidROIAlignPluginCreator::mPluginAttributes; + +PyramidROIAlignPluginCreator::PyramidROIAlignPluginCreator() +{ + mPluginAttributes.emplace_back(PluginField("pooled_size", nullptr, PluginFieldType::kINT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* PyramidROIAlignPluginCreator::getPluginName() const +{ + return PYRAMIDROIALGIN_PLUGIN_NAME; +}; + +const char* PyramidROIAlignPluginCreator::getPluginVersion() const +{ + return PYRAMIDROIALGIN_PLUGIN_VERSION; +}; + +const PluginFieldCollection* PyramidROIAlignPluginCreator::getFieldNames() +{ + return &mFC; +}; + +IPluginV2Ext* PyramidROIAlignPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +{ + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "pooled_size")) + { + assert(fields[i].type == PluginFieldType::kINT32); + mPooledSize = *(static_cast(fields[i].data)); + } + } + return new PyramidROIAlign(mPooledSize); +}; + +IPluginV2Ext* PyramidROIAlignPluginCreator::deserializePlugin(const char* name, const void* data, size_t length) +{ + return new PyramidROIAlign(data, length); +}; + +PyramidROIAlign::PyramidROIAlign(int pooled_size) + : mPooledSize({pooled_size, pooled_size}) +{ + + assert(pooled_size > 0); + // shape + mInputSize = MaskRCNNConfig::IMAGE_SHAPE.d[1]; + mThresh = (224 * 224 * 2.0f / (mInputSize * mInputSize)) / (4.0 * 4.0f); +}; + +int PyramidROIAlign::getNbOutputs() const +{ + return 1; +}; + +int PyramidROIAlign::initialize() +{ + return 0; +}; + +void PyramidROIAlign::terminate(){ + +}; + +void PyramidROIAlign::destroy() +{ + delete this; +}; + +size_t PyramidROIAlign::getWorkspaceSize(int) const +{ + return 0; +} + +bool PyramidROIAlign::supportsFormat(DataType type, PluginFormat format) const +{ + return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); +}; + +const char* PyramidROIAlign::getPluginType() const +{ + return "PyramidROIAlign_TRT"; +}; + +const char* PyramidROIAlign::getPluginVersion() const +{ + return "1"; +}; + +IPluginV2Ext* PyramidROIAlign::clone() const +{ + return new PyramidROIAlign(*this); +}; + +void PyramidROIAlign::setPluginNamespace(const char* libNamespace) +{ + mNameSpace = libNamespace; +}; + +const char* PyramidROIAlign::getPluginNamespace() const +{ + return mNameSpace.c_str(); +} + +void PyramidROIAlign::check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims) +{ + // to be compatible with tensorflow node's input: + // roi: [N, anchors, 4], + // feature_map list(4 maps): p2, p3, p4, p5 + assert(nbInputDims == 1 + mFeatureMapCount); + + nvinfer1::Dims rois = inputs[0]; + assert(rois.nbDims == 2); + assert(rois.d[1] == 4); + + for (int i = 1; i < nbInputDims; ++i) + { + nvinfer1::Dims dims = inputs[i]; + + // CHW with the same #C + assert(dims.nbDims == 3 && dims.d[0] == inputs[i].d[0]); + } +} + +Dims PyramidROIAlign::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) +{ + + check_valid_inputs(inputs, nbInputDims); + 
assert(index == 0); + + nvinfer1::Dims result; + result.nbDims = 4; + + // mROICount + result.d[0] = inputs[0].d[0]; + // mFeatureLength + result.d[1] = inputs[1].d[0]; + // height + result.d[2] = mPooledSize.y; + // width + result.d[3] = mPooledSize.x; + + return result; +}; + +int PyramidROIAlign::enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +{ + + void* pooled = outputs[0]; + + cudaError_t status = roiAlign(stream, batch_size, mFeatureLength, mROICount, mThresh, + + inputs[0], &inputs[1], mFeatureSpatialSize, + + pooled, mPooledSize); + + assert(status == cudaSuccess); + return 0; +}; + +size_t PyramidROIAlign::getSerializationSize() const +{ + return sizeof(int) * 2 + sizeof(int) * 3 + sizeof(float) + sizeof(int) * 2 * 4; +}; + +void PyramidROIAlign::serialize(void* buffer) const +{ + char *d = reinterpret_cast(buffer), *a = d; + write(d, mPooledSize.y); + write(d, mPooledSize.x); + write(d, mFeatureLength); + write(d, mROICount); + write(d, mInputSize); + write(d, mThresh); + write(d, mFeatureSpatialSize[0].y); + write(d, mFeatureSpatialSize[0].x); + write(d, mFeatureSpatialSize[1].y); + write(d, mFeatureSpatialSize[1].x); + write(d, mFeatureSpatialSize[2].y); + write(d, mFeatureSpatialSize[2].x); + write(d, mFeatureSpatialSize[3].y); + write(d, mFeatureSpatialSize[3].x); + assert(d == a + getSerializationSize()); +}; + +PyramidROIAlign::PyramidROIAlign(const void* data, size_t length) +{ + const char *d = reinterpret_cast(data), *a = d; + mPooledSize = {read(d), read(d)}; + mFeatureLength = read(d); + mROICount = read(d); + mInputSize = read(d); + mThresh = read(d); + mFeatureSpatialSize[0].y = read(d); + mFeatureSpatialSize[0].x = read(d); + mFeatureSpatialSize[1].y = read(d); + mFeatureSpatialSize[1].x = read(d); + mFeatureSpatialSize[2].y = read(d); + mFeatureSpatialSize[2].x = read(d); + mFeatureSpatialSize[3].y = read(d); + mFeatureSpatialSize[3].x = read(d); + + assert(d == a + length); +}; + +// Return the DataType of the plugin output at the requested index +DataType PyramidROIAlign::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +{ + // Only DataType::kFLOAT is acceptable by the plugin layer + return DataType::kFLOAT; +} + +// Return true if output tensor is broadcast across a batch. +bool PyramidROIAlign::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const +{ + return false; +} + +// Return true if plugin can use input that is broadcast across batch without replication. +bool PyramidROIAlign::canBroadcastInputAcrossBatch(int inputIndex) const +{ + return false; +} + +// Configure the layer with input and output data types. +void PyramidROIAlign::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) +{ + assert(supportsFormat(inputTypes[0], floatFormat)); + check_valid_inputs(inputDims, nbInputs); + + assert(nbOutputs == 1); + assert(nbInputs == 1 + mFeatureMapCount); + + mROICount = inputDims[0].d[0]; + mFeatureLength = inputDims[1].d[0]; + + for (size_t layer = 0; layer < mFeatureMapCount; ++layer) + { + mFeatureSpatialSize[layer] = {inputDims[layer + 1].d[1], inputDims[layer + 1].d[2]}; + } +} + +// Attach the plugin object to an execution context and grant the plugin the access to some context resource. 
+void PyramidROIAlign::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) +{ +} + +// Detach the plugin object from its execution context. +void PyramidROIAlign::detachFromContext() {} diff --git a/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h new file mode 100644 index 00000000..22ee321f --- /dev/null +++ b/plugin/pyramidROIAlignPlugin/pyramidROIAlignPlugin.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TRT_PYRAMID_ROIALIGN_PLUGIN_H +#define TRT_PYRAMID_ROIALIGN_PLUGIN_H + +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "maskRCNNKernels.h" +#include "mrcnn_config.h" + +namespace nvinfer1 +{ +namespace plugin +{ + +class PyramidROIAlign : public IPluginV2Ext +{ +public: + PyramidROIAlign(int pooled_size); + + PyramidROIAlign(const void* data, size_t length); + + ~PyramidROIAlign() override = default; + + int getNbOutputs() const override; + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + void terminate() override; + + void destroy() override; + + size_t getWorkspaceSize(int) const override; + + int enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + size_t getSerializationSize() const override; + + void serialize(void* buffer) const override; + + bool supportsFormat(DataType type, PluginFormat format) const override; + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + IPluginV2Ext* clone() const override; + + void setPluginNamespace(const char* libNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; + + void detachFromContext() override; + +private: + void check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims); + + xy_t mPooledSize; + static const int mFeatureMapCount = 4; + int mFeatureLength; + int mROICount; + int mInputSize; + float mThresh; + xy_t mFeatureSpatialSize[mFeatureMapCount]; + std::string mNameSpace; +}; + +class PyramidROIAlignPluginCreator : public BaseCreator +{ +public: + 
PyramidROIAlignPluginCreator();
+
+    ~PyramidROIAlignPluginCreator(){};
+
+    const char* getPluginName() const override;
+
+    const char* getPluginVersion() const override;
+
+    const PluginFieldCollection* getFieldNames() override;
+
+    IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override;
+
+    IPluginV2Ext* deserializePlugin(const char* name, const void* data, size_t length) override;
+
+private:
+    static PluginFieldCollection mFC;
+    int mPooledSize;
+    static std::vector<PluginField> mPluginAttributes;
+};
+} // namespace plugin
+} // namespace nvinfer1
+#endif // TRT_PYRAMID_ROIALIGN_PLUGIN_H
diff --git a/plugin/resizeNearestPlugin/CMakeLists.txt b/plugin/resizeNearestPlugin/CMakeLists.txt
new file mode 100644
index 00000000..1dbe788c
--- /dev/null
+++ b/plugin/resizeNearestPlugin/CMakeLists.txt
@@ -0,0 +1,18 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+file(GLOB SRCS *.cpp *.cu)
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
diff --git a/plugin/resizeNearestPlugin/README.md b/plugin/resizeNearestPlugin/README.md
new file mode 100644
index 00000000..b8d70df0
--- /dev/null
+++ b/plugin/resizeNearestPlugin/README.md
@@ -0,0 +1,57 @@
+# ResizeNearest
+
+**Table Of Contents**
+- [Description](#description)
+  * [Structure](#structure)
+- [Parameters](#parameters)
+- [Additional resources](#additional-resources)
+- [License](#license)
+- [Changelog](#changelog)
+- [Known issues](#known-issues)
+
+## Description
+
+The `ResizeNearest` plugin performs nearest-neighbor interpolation on a feature map. It is used in sampleMaskRCNN.
+
+
+### Structure
+
+This plugin supports the NCHW format. It takes one input tensor, `feature_map`.
+
+`feature_map` can be an arbitrary feature map from a convolution layer, of shape `[N, C, H, W]`.
+
+`ResizeNearest` generates the resized feature map according to the scale factor. For example, if the input feature map has shape `[N, C, H, W]` and `scale=2.0`, the output feature map will have shape `[N, C, 2.0 * H, 2.0 * W]`. A minimal CPU sketch of the underlying index mapping is given at the end of this README.
+
+## Parameters
+
+This plugin has the plugin creator class `ResizeNearestPluginCreator` and the plugin class `ResizeNearest`.
+
+The following parameters are used to create a `ResizeNearest` instance:
+
+| Type               | Parameter                      | Description
+|--------------------|--------------------------------|--------------------------------------------------------
+|`float`             |`scale`                         | Scale factor used for resizing.
+
+
+## Additional resources
+
+The following resources provide a deeper understanding of the `ResizeNearest` plugin:
+
+- [MaskRCNN](https://github.com/matterport/Mask_RCNN)
+
+
+## License
+
+For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html)
+documentation.
+
+
+## Changelog
+
+June 2019
+This is the first release of this `README.md` file.
+
+
+## Known issues
+
+There are no known issues in this plugin.
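+
+As referenced in the [Structure](#structure) section, the interpolation reduces to an index mapping from each output pixel to its nearest source pixel. The following is a minimal CPU sketch of that mapping for a single channel (an illustration of the operation only, not the CUDA kernel shipped with the plugin):
+
+```
+#include <algorithm>
+
+// Nearest-neighbor resize of one [inH, inW] channel to [inH * scale, inW * scale].
+// The plugin performs the same mapping on the GPU for all N * C channels.
+void resizeNearestChannel(const float* in, int inH, int inW, float scale, float* out)
+{
+    const int outH = static_cast<int>(inH * scale);
+    const int outW = static_cast<int>(inW * scale);
+    for (int oy = 0; oy < outH; ++oy)
+    {
+        for (int ox = 0; ox < outW; ++ox)
+        {
+            // Each output pixel copies the nearest (truncated) source pixel.
+            const int iy = std::min(static_cast<int>(oy / scale), inH - 1);
+            const int ix = std::min(static_cast<int>(ox / scale), inW - 1);
+            out[oy * outW + ox] = in[iy * inW + ix];
+        }
+    }
+}
+```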
diff --git a/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp b/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp new file mode 100644 index 00000000..e7733a71 --- /dev/null +++ b/plugin/resizeNearestPlugin/resizeNearestPlugin.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "resizeNearestPlugin.h" +#include "plugin.h" +#include +#include + +#define DEBUG 0 + +using namespace nvinfer1; +using namespace plugin; +using nvinfer1::plugin::ResizeNearest; +using nvinfer1::plugin::ResizeNearestPluginCreator; + +namespace +{ +const char* RESIZE_PLUGIN_VERSION{"1"}; +const char* RESIZE_PLUGIN_NAME{"ResizeNearest_TRT"}; +} // namespace + +PluginFieldCollection ResizeNearestPluginCreator::mFC{}; +std::vector ResizeNearestPluginCreator::mPluginAttributes; + +ResizeNearestPluginCreator::ResizeNearestPluginCreator() +{ + mPluginAttributes.emplace_back(PluginField("scale", nullptr, PluginFieldType::kFLOAT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* ResizeNearestPluginCreator::getPluginName() const +{ + return RESIZE_PLUGIN_NAME; +}; + +const char* ResizeNearestPluginCreator::getPluginVersion() const +{ + return RESIZE_PLUGIN_VERSION; +}; + +const PluginFieldCollection* ResizeNearestPluginCreator::getFieldNames() +{ + return &mFC; +}; + +IPluginV2Ext* ResizeNearestPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +{ + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "scale")) + { + assert(fields[i].type == PluginFieldType::kFLOAT32); + mScale = *(static_cast(fields[i].data)); + } + } + return new ResizeNearest(mScale); +}; + +IPluginV2Ext* ResizeNearestPluginCreator::deserializePlugin(const char* name, const void* data, size_t length) +{ + return new ResizeNearest(data, length); +}; + +ResizeNearest::ResizeNearest(float scale) + : mScale(scale) +{ + assert(mScale > 0); +}; + +int ResizeNearest::getNbOutputs() const +{ + return 1; +}; + +Dims ResizeNearest::getOutputDimensions(int index, const Dims* inputDims, int nbInputs) +{ + assert(nbInputs == 1); + nvinfer1::Dims const& input = inputDims[0]; + assert(index == 0); + nvinfer1::Dims output; + output.nbDims = input.nbDims; + for (int d = 0; d < input.nbDims; ++d) + { + if (d == input.nbDims - 2 || d == input.nbDims - 1) + { + output.d[d] = int(input.d[d] * mScale); + } + else + { + output.d[d] = input.d[d]; + } + } + return output; +}; + +int ResizeNearest::initialize() +{ + return 0; +}; + +void ResizeNearest::terminate(){ + +}; + +void ResizeNearest::destroy(){ + +}; + +size_t ResizeNearest::getWorkspaceSize(int) const +{ + return 0; +} + +size_t ResizeNearest::getSerializationSize() const +{ + // scale, dimensions: 3 * 2 + return sizeof(float) + sizeof(int) * 3 * 2; +}; + +void ResizeNearest::serialize(void* buffer) const +{ + char *d = 
reinterpret_cast(buffer), *a = d; + write(d, mScale); + write(d, mInputDims.d[0]); + write(d, mInputDims.d[1]); + write(d, mInputDims.d[2]); + write(d, mOutputDims.d[0]); + write(d, mOutputDims.d[1]); + write(d, mOutputDims.d[2]); + ASSERT(d == a + getSerializationSize()); +}; + +ResizeNearest::ResizeNearest(const void* data, size_t length) +{ + const char *d = reinterpret_cast(data), *a = d; + mScale = read(d); + mInputDims = Dims3(); + mInputDims.d[0] = read(d); + mInputDims.d[1] = read(d); + mInputDims.d[2] = read(d); + mOutputDims = Dims3(); + mOutputDims.d[0] = read(d); + mOutputDims.d[1] = read(d); + mOutputDims.d[2] = read(d); + ASSERT(d == a + length); +}; + +const char* ResizeNearest::getPluginType() const +{ + return "ResizeNearest_TRT"; +}; + +const char* ResizeNearest::getPluginVersion() const +{ + return "1"; +}; + +IPluginV2Ext* ResizeNearest::clone() const +{ + return new ResizeNearest(*this); +}; + +void ResizeNearest::setPluginNamespace(const char* libNamespace) +{ + mNameSpace = libNamespace; +}; + +const char* ResizeNearest::getPluginNamespace() const +{ + return mNameSpace.c_str(); +} + +bool ResizeNearest::supportsFormat(DataType type, PluginFormat format) const +{ + return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); +}; + +int ResizeNearest::enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +{ + + int nchan = mOutputDims.d[0]; + float scale = mScale; + int2 osize = {mOutputDims.d[2], mOutputDims.d[1]}; + int istride = mInputDims.d[2]; + int ostride = mOutputDims.d[2]; + int ibatchstride = mInputDims.d[1] * istride; + int obatchstride = mOutputDims.d[1] * ostride; + dim3 block(32, 16); + dim3 grid((osize.x - 1) / block.x + 1, (osize.y - 1) / block.y + 1, std::min(batch_size * nchan, 65535)); + + resizeNearest(grid, block, stream, batch_size * nchan, scale, osize, static_cast(inputs[0]), istride, + ibatchstride, static_cast(outputs[0]), ostride, obatchstride); + + return cudaGetLastError() != cudaSuccess; +}; + +// Return the DataType of the plugin output at the requested index +DataType ResizeNearest::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +{ + // Only 1 input and 1 output from the plugin layer + ASSERT(index == 0); + + // Only DataType::kFLOAT is acceptable by the plugin layer + return DataType::kFLOAT; +} + +// Return true if output tensor is broadcast across a batch. +bool ResizeNearest::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const +{ + return false; +} + +// Return true if plugin can use input that is broadcast across batch without replication. +bool ResizeNearest::canBroadcastInputAcrossBatch(int inputIndex) const +{ + return false; +} + +// Configure the layer with input and output data types. +void ResizeNearest::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) +{ + assert(nbInputs == 1); + mInputDims = inputDims[0]; + + assert(nbOutputs == 1); + mOutputDims = outputDims[0]; +} + +// Attach the plugin object to an execution context and grant the plugin the access to some context resource. +void ResizeNearest::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) +{ +} + +// Detach the plugin object from its execution context. 
+void ResizeNearest::detachFromContext() {} diff --git a/plugin/resizeNearestPlugin/resizeNearestPlugin.h b/plugin/resizeNearestPlugin/resizeNearestPlugin.h new file mode 100644 index 00000000..471ce9a4 --- /dev/null +++ b/plugin/resizeNearestPlugin/resizeNearestPlugin.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TRT_RESIZENEAREST_PLUGIN_H +#define TRT_RESIZENEAREST_PLUGIN_H + +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "maskRCNNKernels.h" + +namespace nvinfer1 +{ +namespace plugin +{ +class ResizeNearest : public IPluginV2Ext +{ +public: + ResizeNearest(float scale); + + ResizeNearest(const void* data, size_t length); + + ~ResizeNearest() override = default; + + int getNbOutputs() const override; + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + void terminate() override; + + void destroy() override; + + size_t getWorkspaceSize(int) const override; + + int enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + size_t getSerializationSize() const override; + + void serialize(void* buffer) const override; + + bool supportsFormat(DataType type, PluginFormat format) const override; + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + IPluginV2Ext* clone() const override; + + void setPluginNamespace(const char* libNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; + + void detachFromContext() override; + +private: + float mScale; + Dims mInputDims; + Dims mOutputDims; + std::string mNameSpace; +}; + +class ResizeNearestPluginCreator : public BaseCreator +{ +public: + ResizeNearestPluginCreator(); + + ~ResizeNearestPluginCreator(){}; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override; + + IPluginV2Ext* deserializePlugin(const char* name, const void* data, size_t length) override; + +private: + static PluginFieldCollection mFC; + 
float mScale; + static std::vector mPluginAttributes; +}; +} // namespace plugin +} // namespace nvinfer1 +#endif // TRT_RESIZENEAREST_PLUGIN_H diff --git a/plugin/specialSlicePlugin/CMakeLists.txt b/plugin/specialSlicePlugin/CMakeLists.txt new file mode 100644 index 00000000..1dbe788c --- /dev/null +++ b/plugin/specialSlicePlugin/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +file(GLOB SRCS *.cpp *.cu) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) diff --git a/plugin/specialSlicePlugin/README.md b/plugin/specialSlicePlugin/README.md new file mode 100644 index 00000000..fbf8f6a4 --- /dev/null +++ b/plugin/specialSlicePlugin/README.md @@ -0,0 +1,53 @@ +# SpecialSlice + +**Table Of Contents** +- [Description](#description) + * [Structure](#structure) +- [Parameters](#parameters) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +The `SpecialSlice` plugin slices the detections of MaskRCNN from `[y1, x1, y2, x2, class_label, score]` to `[y1, x1, y2, x2]`. It is used in sampleMaskRCNN. + + +### Structure + +This plugin supports the NCHW format. It takes one input tensor: `detections`. + +`detections` is the output of `DetectionLayer` in the MaskRCNN model. Its shape is `[N, num_det, 6]`, where `N` is the batch size, `num_det` is the number of detections generated by `DetectionLayer`, and `6` corresponds to the 6 elements of a detection `[y1, x1, y2, x2, class_label, score]`. + +This plugin generates one output tensor of shape `[N, num_det, 4]`. + +## Parameters + +This plugin has the plugin creator class `SpecialSlicePluginCreator` and the plugin class `SpecialSlice`. + +This plugin has no parameters. + + +## Additional resources + +The following resources provide a deeper understanding of the `SpecialSlice` plugin: + +- [MaskRCNN](https://github.com/matterport/Mask_RCNN) + + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) +documentation. + + +## Changelog + +June 2019 +This is the first release of this `README.md` file. + + +## Known issues + +There are no known issues in this plugin. diff --git a/plugin/specialSlicePlugin/specialSlicePlugin.cpp b/plugin/specialSlicePlugin/specialSlicePlugin.cpp new file mode 100644 index 00000000..45c61096 --- /dev/null +++ b/plugin/specialSlicePlugin/specialSlicePlugin.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "specialSlicePlugin.h" +#include "maskRCNNKernels.h" +#include + +using namespace nvinfer1; +using namespace plugin; +using nvinfer1::plugin::SpecialSlice; +using nvinfer1::plugin::SpecialSlicePluginCreator; + +namespace +{ +const char* SPECIALSLICE_PLUGIN_VERSION{"1"}; +const char* SPECIALSLICE_PLUGIN_NAME{"SpecialSlice_TRT"}; +} // namespace + +PluginFieldCollection SpecialSlicePluginCreator::mFC{}; +std::vector SpecialSlicePluginCreator::mPluginAttributes; + +SpecialSlicePluginCreator::SpecialSlicePluginCreator() +{ + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* SpecialSlicePluginCreator::getPluginName() const +{ + return SPECIALSLICE_PLUGIN_NAME; +}; + +const char* SpecialSlicePluginCreator::getPluginVersion() const +{ + return SPECIALSLICE_PLUGIN_VERSION; +}; + +const PluginFieldCollection* SpecialSlicePluginCreator::getFieldNames() +{ + return &mFC; +}; + +IPluginV2Ext* SpecialSlicePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) +{ + return new SpecialSlice(); +}; + +IPluginV2Ext* SpecialSlicePluginCreator::deserializePlugin(const char* name, const void* data, size_t length) +{ + return new SpecialSlice(data, length); +}; + +size_t SpecialSlice::getWorkspaceSize(int) const +{ + return 0; +} + +bool SpecialSlice::supportsFormat(DataType type, PluginFormat format) const +{ + return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); +}; + +const char* SpecialSlice::getPluginType() const +{ + return "SpecialSlice_TRT"; +}; + +const char* SpecialSlice::getPluginVersion() const +{ + return "1"; +}; + +IPluginV2Ext* SpecialSlice::clone() const +{ + return new SpecialSlice(*this); +}; + +void SpecialSlice::setPluginNamespace(const char* libNamespace) +{ + mNameSpace = libNamespace; +}; + +const char* SpecialSlice::getPluginNamespace() const +{ + return mNameSpace.c_str(); +} + +size_t SpecialSlice::getSerializationSize() const +{ + return sizeof(int); +}; + +void SpecialSlice::serialize(void* buffer) const +{ + char *d = reinterpret_cast(buffer), *a = d; + write(d, mBboxesCnt); + ASSERT(d == a + getSerializationSize()); +}; + +SpecialSlice::SpecialSlice(const void* data, size_t length) +{ + const char *d = reinterpret_cast(data), *a = d; + mBboxesCnt = read(d); + assert(d == a + length); +}; + +SpecialSlice::SpecialSlice(){ + +}; + +int SpecialSlice::initialize() +{ + return 0; +}; + +int SpecialSlice::getNbOutputs() const +{ + return 1; +}; + +void SpecialSlice::check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims) +{ + + assert(nbInputDims == 1); + // detections: [N, anchors, (y1, x1, y2, x2, class_id, score)] + assert(inputs[0].nbDims == 2 && inputs[0].d[1] == 6); +} + +Dims SpecialSlice::getOutputDimensions(int index, const Dims* inputDims, int nbInputs) +{ + + assert(index == 0); + assert(nbInputs == 1); + check_valid_inputs(inputDims, nbInputs); + + nvinfer1::Dims output; + output.nbDims = inputDims[0].nbDims; + // number of anchors + output.d[0] = inputDims[0].d[0]; + //(y1, x1, y2, x2) + output.d[1] = 4; + + return output; +}; + +int 
SpecialSlice::enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +{ + + specialSlice(stream, batch_size, mBboxesCnt, inputs[0], outputs[0]); + + return cudaGetLastError() != cudaSuccess; +}; + +// Return the DataType of the plugin output at the requested index +DataType SpecialSlice::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const +{ + // Only 1 input and 1 output from the plugin layer + ASSERT(index == 0); + + // Only DataType::kFLOAT is acceptable by the plugin layer + return DataType::kFLOAT; +} + +// Return true if output tensor is broadcast across a batch. +bool SpecialSlice::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const +{ + return false; +} + +// Return true if plugin can use input that is broadcast across batch without replication. +bool SpecialSlice::canBroadcastInputAcrossBatch(int inputIndex) const +{ + return false; +} + +// Configure the layer with input and output data types. +void SpecialSlice::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) +{ + assert(nbInputs == 1); + + assert(nbOutputs == 1); + + mBboxesCnt = inputDims[0].d[0]; +} + +// Attach the plugin object to an execution context and grant the plugin the access to some context resource. +void SpecialSlice::attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) +{ +} + +// Detach the plugin object from its execution context. +void SpecialSlice::detachFromContext() {} diff --git a/plugin/specialSlicePlugin/specialSlicePlugin.h b/plugin/specialSlicePlugin/specialSlicePlugin.h new file mode 100644 index 00000000..0d35ea9d --- /dev/null +++ b/plugin/specialSlicePlugin/specialSlicePlugin.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef TRT_SPECIAL_SLICE_PLUGIN_H +#define TRT_SPECIAL_SLICE_PLUGIN_H + +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "maskRCNNKernels.h" + +namespace nvinfer1 +{ +namespace plugin +{ +class SpecialSlice : public IPluginV2Ext +{ +public: + SpecialSlice(); + + SpecialSlice(const void* data, size_t length); + + ~SpecialSlice() override = default; + + int getNbOutputs() const override; + + void check_valid_inputs(const nvinfer1::Dims* inputs, int nbInputDims); + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + void terminate() override{}; + + void destroy() override{}; + + size_t getWorkspaceSize(int) const override; + + int enqueue( + int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + size_t getSerializationSize() const override; + + void serialize(void* buffer) const override; + + bool supportsFormat(DataType type, PluginFormat format) const override; + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + IPluginV2Ext* clone() const override; + + void setPluginNamespace(const char* libNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, + const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, + const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override; + + void detachFromContext() override; + +private: + int mBboxesCnt; + std::string mNameSpace; +}; + +class SpecialSlicePluginCreator : public BaseCreator +{ +public: + SpecialSlicePluginCreator(); + + ~SpecialSlicePluginCreator() override = default; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override; + + IPluginV2Ext* deserializePlugin(const char* name, const void* data, size_t length) override; + +private: + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; +} // namespace plugin +} // namespace nvinfer1 +#endif // TRT_SPECIAL_SLICE_PLUGIN_H diff --git a/samples/CMakeSamplesTemplate.txt b/samples/CMakeSamplesTemplate.txt index 28917a3a..3c28f339 100644 --- a/samples/CMakeSamplesTemplate.txt +++ b/samples/CMakeSamplesTemplate.txt @@ -112,8 +112,10 @@ if("uff" IN_LIST SAMPLE_PARSERS) list(APPEND SAMPLE_DEP_LIBS nvuffparser) endif() +# Necessary to link nvinfer_plugin library. 
target_link_libraries(${TARGET_NAME} ${SAMPLE_DEP_LIBS} + -Wl,--unresolved-symbols=ignore-in-shared-libs ) set_target_properties(${TARGET_NAME} PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL") diff --git a/samples/README.md b/samples/README.md index e5037f4c..b0f2f7a9 100644 --- a/samples/README.md +++ b/samples/README.md @@ -4,15 +4,15 @@ This guide shows how to cross compile TensorRT samples for AArch64 QNX, Linux an ### Common Prerequisites -* Install the CUDA cross-platform toolkit for the the corresponding target, and set the environment variable `CUDA_INSTALL_DIR` +* Install the CUDA cross-platform toolkit for the the corresponding target, and set the environment variable `CUDA_INSTALL_DIR` ```shell export CUDA_INSTALL_DIR="your cuda install dir" ``` - `CUDA_INSTALL_DIR` is set to `/usr/local/cuda` by default. + `CUDA_INSTALL_DIR` is set to `/usr/local/cuda` by default. -* Install the cuDNN cross-platform libraries for the corresponding target, and set the environment variable `CUDNN_INSTALL_DIR` +* Install the cuDNN cross-platform libraries for the corresponding target, and set the environment variable `CUDNN_INSTALL_DIR` ```shell export CUDNN_INSTALL_DIR="your cudnn install dir" @@ -20,7 +20,7 @@ This guide shows how to cross compile TensorRT samples for AArch64 QNX, Linux an `CUDNN_INSTALL_DIR` is set to `CUDA_INSTALL_DIR` by default. -* Install the TensorRT cross compilation debian packages for the corresponding target. +* Install the TensorRT cross compilation debian packages for the corresponding target. * QNX AArch64: libnvinfer-dev-cross-qnx, libnvinfer5-cross-qnx * Linux AArch64: libnvinfer-dev-cross-aarch64, libnvinfer5-cross-aarch64 * Android AArch64: No debian packages are available. @@ -76,4 +76,4 @@ Build samples via ```shell cd /path/to/TensorRT/samples make TARGET=android64 ANDROID_CC=/path/to/my-toolchain/bin/aarch64-linux-android-clang++ -``` \ No newline at end of file +``` diff --git a/samples/common/BatchStream.h b/samples/common/BatchStream.h index 53ea3f7c..c3c72447 100644 --- a/samples/common/BatchStream.h +++ b/samples/common/BatchStream.h @@ -23,7 +23,134 @@ #include #include -class BatchStream +class IBatchStream +{ +public: + virtual void reset(int firstBatch) = 0; + virtual bool next() = 0; + virtual void skip(int skipCount) = 0; + virtual float* getBatch() = 0; + virtual float* getLabels() = 0; + virtual int getBatchesRead() const = 0; + virtual int getBatchSize() const = 0; + virtual nvinfer1::Dims getDims() const = 0; +}; + +class MNISTBatchStream : public IBatchStream +{ +public: + MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile, + const std::vector& directories) + : mBatchSize{batchSize} + , mMaxBatches{maxBatches} + , mDims{3, 1, 28, 28} //!< We already know the dimensions of MNIST images. 
+ { + readDataFile(locateFile(dataFile, directories)); + readLabelsFile(locateFile(labelsFile, directories)); + } + + void reset(int firstBatch) override + { + mBatchCount = firstBatch; + } + + bool next() override + { + if (mBatchCount >= mMaxBatches) + { + return false; + } + ++mBatchCount; + return true; + } + + void skip(int skipCount) override + { + mBatchCount += skipCount; + } + + float* getBatch() override + { + return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); + } + + float* getLabels() override + { + return mLabels.data() + (mBatchCount * mBatchSize); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return mDims; + } + +private: + void readDataFile(const std::string& dataFilePath) + { + std::ifstream file{dataFilePath.c_str(), std::ios::binary}; + + int magicNumber, numImages, imageH, imageW; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + assert(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set"); + + // Read number of images and dimensions + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + file.read(reinterpret_cast(&imageH), sizeof(imageH)); + file.read(reinterpret_cast(&imageW), sizeof(imageW)); + + numImages = samplesCommon::swapEndianness(numImages); + imageH = samplesCommon::swapEndianness(imageH); + imageW = samplesCommon::swapEndianness(imageW); + + // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize. + int numElements = numImages * imageH * imageW; + std::vector rawData(numElements); + file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); + mData.resize(numElements); + std::transform( + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + } + + void readLabelsFile(const std::string& labelsFilePath) + { + std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; + int magicNumber, numImages; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + assert(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file"); + + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + numImages = samplesCommon::swapEndianness(numImages); + + std::vector rawLabels(numImages); + file.read(reinterpret_cast(rawLabels.data()), numImages * sizeof(uint8_t)); + mLabels.resize(numImages); + std::transform( + rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast(val); }); + } + + int mBatchSize{0}; + int mBatchCount{0}; //!< The batch that will be read on the next invocation of next() + int mMaxBatches{0}; + Dims mDims{}; + std::vector mData{}; + std::vector mLabels{}; +}; + +class BatchStream : public IBatchStream { public: BatchStream( @@ -77,7 +204,7 @@ class BatchStream } // Resets data members - void reset(int firstBatch) + void reset(int firstBatch) override { mBatchCount = 0; mFileCount = 0; @@ -86,7 +213,7 @@ class BatchStream } // Advance to next batch and return true, or return false if there is no batch left. 
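// Illustrative sketch, not part of this change: MNISTBatchStream is meant to drive the INT8
// calibrators used by the samples. Together with the templated Int8EntropyCalibrator2 from
// EntropyCalibrator.h (updated later in this diff), the wiring could look roughly like this;
// the file names, data directory, network tag and input blob name below are hypothetical:
//
//     MNISTBatchStream calibrationStream(/*batchSize*/ 32, /*maxBatches*/ 10,
//         "train-images-idx3-ubyte", "train-labels-idx1-ubyte", {"data/mnist/"});
//     Int8EntropyCalibrator2<MNISTBatchStream> calibrator(
//         calibrationStream, /*firstBatch*/ 0, /*networkName*/ "mnist", /*inputBlobName*/ "data");
//     config->setFlag(BuilderFlag::kINT8);    // `config` is an assumed nvinfer1::IBuilderConfig*
//     config->setInt8Calibrator(&calibrator); // the calibrator must outlive the engine build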
- bool next() + bool next() override { if (mBatchCount == mMaxBatches) { @@ -112,7 +239,7 @@ class BatchStream } // Skips the batches - void skip(int skipCount) + void skip(int skipCount) override { if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) { @@ -128,27 +255,27 @@ class BatchStream mBatchCount = x; } - float* getBatch() + float* getBatch() override { return mBatch.data(); } - float* getLabels() + float* getLabels() override { return mLabels.data(); } - int getBatchesRead() const + int getBatchesRead() const override { return mBatchCount; } - int getBatchSize() const + int getBatchSize() const override { return mBatchSize; } - nvinfer1::Dims getDims() const + nvinfer1::Dims getDims() const override { return mDims; } diff --git a/samples/common/EntropyCalibrator.h b/samples/common/EntropyCalibrator.h index 9fc78705..efa80081 100644 --- a/samples/common/EntropyCalibrator.h +++ b/samples/common/EntropyCalibrator.h @@ -24,18 +24,19 @@ //! //! \brief Implements common functionality for Entropy calibrators. //! +template class EntropyCalibratorImpl { public: EntropyCalibratorImpl( - BatchStream& stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) - : mStream(stream) + TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) + : mStream{stream} , mCalibrationTableName("CalibrationTable" + networkName) , mInputBlobName(inputBlobName) , mReadCache(readCache) { nvinfer1::Dims dims = mStream.getDims(); - mInputCount = samplesCommon::volume(dims); + mInputCount = samplesCommon::volume(dims) * mStream.getBatchSize(); CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); mStream.reset(firstBatch); } @@ -83,7 +84,7 @@ class EntropyCalibratorImpl } private: - BatchStream mStream; + TBatchStream mStream; size_t mInputCount; std::string mCalibrationTableName; const char* mInputBlobName; @@ -97,11 +98,12 @@ class EntropyCalibratorImpl //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! +template class Int8EntropyCalibrator2 : public IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2( - BatchStream& stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) + TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) { } @@ -127,7 +129,7 @@ class Int8EntropyCalibrator2 : public IInt8EntropyCalibrator2 } private: - EntropyCalibratorImpl mImpl; + EntropyCalibratorImpl mImpl; }; #endif // ENTROPY_CALIBRATOR_H diff --git a/samples/common/ErrorRecorder.h b/samples/common/ErrorRecorder.h new file mode 100644 index 00000000..16b9e773 --- /dev/null +++ b/samples/common/ErrorRecorder.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ERROR_RECORDER_H +#define ERROR_RECORDER_H +#include "NvInferRuntimeCommon.h" +#include +#include +#include +#include +#include +using namespace nvinfer1; +//! +//! A simple imeplementation of the IErrorRecorder interface for +//! use by samples. This interface also can be used as a reference +//! implementation. +//! The sample Error recorder is based on a vector that pairs the error +//! code and the error string into a single element. It also uses +//! standard mutex's and atomics in order to make sure that the code +//! works in a multi-threaded environment. +//! SampleErrorRecorder is not intended for use in automotive safety +//! environments. +//! +class SampleErrorRecorder : public IErrorRecorder +{ + using errorPair = std::pair; + using errorStack = std::vector; + +public: + SampleErrorRecorder() = default; + + virtual ~SampleErrorRecorder() noexcept {} + int32_t getNbErrors() const noexcept final + { + return mErrorStack.size(); + } + ErrorCode getErrorCode(int32_t errorIdx) const noexcept final + { + return indexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first; + }; + IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final + { + return indexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str(); + } + // This class can never overflow since we have dynamic resize via std::vector usage. + bool hasOverflowed() const noexcept final + { + return false; + } + + // Empty the errorStack. + void clear() noexcept final + { + try + { + // grab a lock so that there is no addition while clearing. + std::lock_guard guard(mStackLock); + mErrorStack.clear(); + } + catch (const std::exception& e) + { + getLogger()->log(ILogger::Severity::kINTERNAL_ERROR, e.what()); + } + }; + + //! Simple helper function that + bool empty() const noexcept + { + return mErrorStack.empty(); + } + + bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final + { + try + { + std::lock_guard guard(mStackLock); + mErrorStack.push_back(errorPair(val, desc)); + } + catch (const std::exception& e) + { + getLogger()->log(ILogger::Severity::kINTERNAL_ERROR, e.what()); + } + // All errors are considered fatal. + return true; + } + + // Atomically increment or decrement the ref counter. + IErrorRecorder::RefCount incRefCount() noexcept final + { + return ++mRefCount; + } + IErrorRecorder::RefCount decRefCount() noexcept final + { + return --mRefCount; + } + +private: + // Simple helper functions. + const errorPair& operator[](size_t index) const noexcept + { + return mErrorStack[index]; + } + + bool indexCheck(int32_t index) const noexcept + { + // By converting signed to unsigned, we only need a single check since + // negative numbers turn into large positive greater than the size. + size_t sIndex = index; + return sIndex >= mErrorStack.size(); + } + // Mutex to hold when locking mErrorStack. + std::mutex mStackLock; + + // Reference count of the class. Destruction of the class when mRefCount + // is not zero causes undefined behavior. + std::atomic mRefCount{0}; + + // The error stack that holds the errors recorded by TensorRT. 
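// Illustrative usage sketch, not part of this class: a sample could attach the recorder through
// the error-recorder hook added in TensorRT 6 (an assumption here) and drain it after a failing
// call; `builder` is a hypothetical nvinfer1::IBuilder* and gLogError comes from the sample logger:
//
//     SampleErrorRecorder errorRecorder;
//     builder->setErrorRecorder(&errorRecorder);
//     // ... builder / engine calls ...
//     for (int32_t i = 0; i < errorRecorder.getNbErrors(); ++i)
//     {
//         gLogError << "TensorRT error " << static_cast<int32_t>(errorRecorder.getErrorCode(i))
//                   << ": " << errorRecorder.getErrorDesc(i) << std::endl;
//     }
//     errorRecorder.clear();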
+ errorStack mErrorStack; +}; // class SampleErrorRecorder +#endif // ERROR_RECORDER_H diff --git a/samples/common/argsParser.h b/samples/common/argsParser.h index ff91023e..0d98a40d 100644 --- a/samples/common/argsParser.h +++ b/samples/common/argsParser.h @@ -81,6 +81,7 @@ struct Args bool runInFp16{false}; bool help{false}; int useDLACore{-1}; + int batch{1}; std::vector dataDirs; }; @@ -96,9 +97,9 @@ inline bool parseArgs(Args& args, int argc, char* argv[]) while (1) { int arg; - static struct option long_options[] - = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, {"int8", no_argument, 0, 'i'}, - {"fp16", no_argument, 0, 'f'}, {"useDLACore", required_argument, 0, 'u'}, {nullptr, 0, nullptr, 0}}; + static struct option long_options[] = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, + {"int8", no_argument, 0, 'i'}, {"fp16", no_argument, 0, 'f'}, {"useDLACore", required_argument, 0, 'u'}, + {"batch", required_argument, 0, 'b'}, {nullptr, 0, nullptr, 0}}; int option_index = 0; arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); if (arg == -1) @@ -128,6 +129,12 @@ inline bool parseArgs(Args& args, int argc, char* argv[]) args.useDLACore = std::stoi(optarg); } break; + case 'b': + if (optarg) + { + args.batch = std::stoi(optarg); + } + break; default: return false; } } diff --git a/samples/common/buffers.h b/samples/common/buffers.h index 176ed7c9..47abf8ce 100644 --- a/samples/common/buffers.h +++ b/samples/common/buffers.h @@ -56,8 +56,10 @@ class GenericBuffer //! //! \brief Construct an empty buffer. //! - GenericBuffer() - : mByteSize(0) + GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) + : mSize(0) + , mCapacity(0) + , mType(type) , mBuffer(nullptr) { } @@ -65,18 +67,26 @@ class GenericBuffer //! //! \brief Construct a buffer with the specified allocation size in bytes. //! - GenericBuffer(size_t size) - : mByteSize(size) + GenericBuffer(size_t size, nvinfer1::DataType type) + : mSize(size) + , mCapacity(size) + , mType(type) { - if (!allocFn(&mBuffer, mByteSize)) + if (!allocFn(&mBuffer, this->nbBytes())) + { throw std::bad_alloc(); + } } GenericBuffer(GenericBuffer&& buf) - : mByteSize(buf.mByteSize) + : mSize(buf.mSize) + , mCapacity(buf.mCapacity) + , mType(buf.mType) , mBuffer(buf.mBuffer) { - buf.mByteSize = 0; + buf.mSize = 0; + buf.mCapacity = 0; + buf.mType = nvinfer1::DataType::kFLOAT; buf.mBuffer = nullptr; } @@ -85,9 +95,13 @@ class GenericBuffer if (this != &buf) { freeFn(mBuffer); - mByteSize = buf.mByteSize; + mSize = buf.mSize; + mCapacity = buf.mCapacity; + mType = buf.mType; mBuffer = buf.mBuffer; - buf.mByteSize = 0; + // Reset buf. + buf.mSize = 0; + buf.mCapacity = 0; buf.mBuffer = nullptr; } return *this; @@ -110,11 +124,44 @@ class GenericBuffer } //! - //! \brief Returns the size (in bytes) of the buffer. + //! \brief Returns the size (in number of elements) of the buffer. //! size_t size() const { - return mByteSize; + return mSize; + } + + //! + //! \brief Returns the size (in bytes) of the buffer. + //! + size_t nbBytes() const + { + return this->size() * samplesCommon::getElementSize(mType); + } + + //! + //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. + //! + void resize(size_t newSize) + { + mSize = newSize; + if (mCapacity < newSize) + { + freeFn(mBuffer); + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc{}; + } + mCapacity = newSize; + } + } + + //! + //! 
\brief Overload of resize that accepts Dims + //! + void resize(const nvinfer1::Dims& dims) + { + return this->resize(samplesCommon::volume(dims)); } ~GenericBuffer() @@ -123,7 +170,8 @@ class GenericBuffer } private: - size_t mByteSize; + size_t mSize{0}, mCapacity{0}; + nvinfer1::DataType mType; void* mBuffer; AllocFunc allocFn; FreeFunc freeFn; @@ -195,19 +243,28 @@ class BufferManager //! //! \brief Create a BufferManager for handling buffer interactions with engine. //! - BufferManager(std::shared_ptr engine, const int& batchSize) + BufferManager(std::shared_ptr engine, const int& batchSize, + const nvinfer1::IExecutionContext* context = nullptr) : mEngine(engine) , mBatchSize(batchSize) { + // Create host and device buffers for (int i = 0; i < mEngine->getNbBindings(); i++) { - // Create host and device buffers - size_t vol = samplesCommon::volume(mEngine->getBindingDimensions(i)); - size_t elementSize = samplesCommon::getElementSize(mEngine->getBindingDataType(i)); - size_t allocationSize = static_cast(mBatchSize) * vol * elementSize; + auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + size_t vol = context ? 1 : static_cast(mBatchSize); + nvinfer1::DataType type = mEngine->getBindingDataType(i); + int vecDim = mEngine->getBindingVectorizedDim(i); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= samplesCommon::volume(dims); std::unique_ptr manBuf{new ManagedBuffer()}; - manBuf->deviceBuffer = DeviceBuffer(allocationSize); - manBuf->hostBuffer = HostBuffer(allocationSize); + manBuf->deviceBuffer = DeviceBuffer(vol, type); + manBuf->hostBuffer = HostBuffer(vol, type); mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); mManagedBuffers.emplace_back(std::move(manBuf)); } @@ -257,7 +314,7 @@ class BufferManager int index = mEngine->getBindingIndex(tensorName.c_str()); if (index == -1) return kINVALID_SIZE_VALUE; - return mManagedBuffers[index]->hostBuffer.size(); + return mManagedBuffers[index]->hostBuffer.nbBytes(); } //! @@ -273,7 +330,7 @@ class BufferManager return; } void* buf = mManagedBuffers[index]->hostBuffer.data(); - size_t bufSize = mManagedBuffers[index]->hostBuffer.size(); + size_t bufSize = mManagedBuffers[index]->hostBuffer.nbBytes(); nvinfer1::Dims bufDims = mEngine->getBindingDimensions(index); size_t rowCount = static_cast(bufDims.nbDims >= 1 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize); @@ -370,7 +427,7 @@ class BufferManager = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); const void* srcPtr = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); - const size_t byteSize = mManagedBuffers[i]->hostBuffer.size(); + const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) { diff --git a/samples/common/common.h b/samples/common/common.h index 3cfa6d1f..f79ab49f 100644 --- a/samples/common/common.h +++ b/samples/common/common.h @@ -17,10 +17,18 @@ #ifndef TENSORRT_COMMON_H #define TENSORRT_COMMON_H +// For loadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. 
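// Illustrative sketch, not part of this header: the BufferManager changes above let a sample size
// its buffers from an execution context when explicit-batch / dynamic shapes are used; `engine`
// (a std::shared_ptr<nvinfer1::ICudaEngine>) and the input dimensions below are hypothetical:
//
//     auto context = engine->createExecutionContext();
//     context->setBindingDimensions(0, nvinfer1::Dims4{1, 3, 224, 224});
//     samplesCommon::BufferManager buffers(engine, /*batchSize*/ 0, context);
//     // With a non-null context, each binding is sized from context->getBindingDimensions().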
+#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif + #include "NvInfer.h" #include "NvInferPlugin.h" -#include "NvOnnxConfig.h" -#include "NvOnnxParser.h" #include "logger.h" #include #include @@ -41,51 +49,79 @@ #include #include #include -#if !defined(_WIN32) -#include -#else -#include -#endif -using namespace std; using namespace nvinfer1; using namespace plugin; +#ifdef _MSC_VER +#define FN_NAME __FUNCTION__ +#else +#define FN_NAME __func__ +#endif + +#if (!defined(__ANDROID__) && defined(__aarch64__)) || defined(__QNX__) +#define ENABLE_DLA_API 1 +#endif + #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ - std::cout << "Cuda failure: " << ret << std::endl; \ + std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) -constexpr long double operator"" _GB(long double val) +#define CHECK_RETURN_W_MSG(status, val, errMsg) \ + do \ + { \ + if (!(status)) \ + { \ + std::cerr << errMsg << " Error in " << __FILE__ << ", function " << FN_NAME << "(), line " << __LINE__ \ + << std::endl; \ + return val; \ + } \ + } while (0) + +#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "") + +#define OBJ_GUARD(A) std::unique_ptr + +template +OBJ_GUARD(T) +makeObjGuard(T_* t) +{ + CHECK(!(std::is_base_of::value || std::is_same::value)); + auto deleter = [](T* t) { t->destroy(); }; + return std::unique_ptr{static_cast(t), deleter}; +} + +constexpr long double operator"" _GiB(long double val) { return val * (1 << 30); } -constexpr long double operator"" _MB(long double val) +constexpr long double operator"" _MiB(long double val) { return val * (1 << 20); } -constexpr long double operator"" _KB(long double val) +constexpr long double operator"" _KiB(long double val) { return val * (1 << 10); } -// These is necessary if we want to be able to write 1_GB instead of 1.0_GB. -// Since the return type is signed, -1_GB will work as expected. -constexpr long long int operator"" _GB(long long unsigned int val) +// These is necessary if we want to be able to write 1_GiB instead of 1.0_GiB. +// Since the return type is signed, -1_GiB will work as expected. +constexpr long long int operator"" _GiB(long long unsigned int val) { return val * (1 << 30); } -constexpr long long int operator"" _MB(long long unsigned int val) +constexpr long long int operator"" _MiB(long long unsigned int val) { return val * (1 << 20); } -constexpr long long int operator"" _KB(long long unsigned int val) +constexpr long long int operator"" _KiB(long long unsigned int val) { return val * (1 << 10); } @@ -186,8 +222,14 @@ inline std::string locateFile(const std::string& filepathSuffix, const std::vect for (auto& dir : directories) { - if (dir.back() != '/') + if (!dir.empty() && dir.back() != '/') + { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else filepath = dir + "/" + filepathSuffix; +#endif + } else filepath = dir + filepathSuffix; @@ -232,6 +274,70 @@ inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, i namespace samplesCommon { +// Swaps endianness of an integral type. 
+template ::value, int>::type = 0> +inline T swapEndianness(const T& value) +{ + uint8_t bytes[sizeof(T)]; + for (int i = 0; i < static_cast(sizeof(T)); ++i) + { + bytes[sizeof(T) - 1 - i] = *(reinterpret_cast(&value) + i); + } + return *reinterpret_cast(bytes); +} + +class HostMemory : public IHostMemory +{ +public: + HostMemory() = delete; + void* data() const noexcept override + { + return mData; + } + std::size_t size() const noexcept override + { + return mSize; + } + DataType type() const noexcept override + { + return mType; + } + +protected: + HostMemory(std::size_t size, DataType type) + : mSize(size) + , mType(type) + { + } + void* mData; + std::size_t mSize; + DataType mType; +}; + +template +class TypedHostMemory : public HostMemory +{ +public: + TypedHostMemory(std::size_t size) + : HostMemory(size, dataType) + { + mData = new ElemType[size]; + }; + void destroy() noexcept override + { + delete[](ElemType*) mData; + delete this; + } + ElemType* raw() noexcept + { + return static_cast(data()); + } +}; + +using FloatMemory = TypedHostMemory; +using HalfMemory = TypedHostMemory; +using ByteMemory = TypedHostMemory; + inline void* safeCudaMalloc(size_t memSize) { void* deviceMem; @@ -292,7 +398,7 @@ inline bool readReferenceFile(const std::string& fileName, std::vector inline std::vector classify( - const vector& refVector, const result_vector_t& output, const size_t topK) + const std::vector& refVector, const result_vector_t& output, const size_t topK) { auto inds = samplesCommon::argsort(output.cbegin(), output.cend(), true); std::vector result; @@ -321,21 +427,21 @@ inline std::vector classify( // Returns top K indices, not values. template -inline vector topK(const vector inp, const size_t k) +inline std::vector topK(const std::vector inp, const size_t k) { - vector result; + std::vector result; std::vector inds = samplesCommon::argsort(inp.cbegin(), inp.cend(), true); result.assign(inds.begin(), inds.begin() + k); return result; } template -inline bool readASCIIFile(const string& fileName, const size_t size, vector& out) +inline bool readASCIIFile(const std::string& fileName, const size_t size, std::vector& out) { std::ifstream infile(fileName); if (!infile.is_open()) { - cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << endl; + std::cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << std::endl; return false; } out.clear(); @@ -346,17 +452,17 @@ inline bool readASCIIFile(const string& fileName, const size_t size, vector& } template -inline bool writeASCIIFile(const string& fileName, const vector& in) +inline bool writeASCIIFile(const std::string& fileName, const std::vector& in) { std::ofstream outfile(fileName); if (!outfile.is_open()) { - cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << endl; + std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << std::endl; return false; } for (auto fn : in) { - outfile << fn << " "; + outfile << fn << "\n"; } outfile.close(); return true; @@ -364,23 +470,18 @@ inline bool writeASCIIFile(const string& fileName, const vector& in) inline void print_version() { -// This can be only done after statically linking this support into parserONNX.library -#if 0 - std::cout << "Parser built against:" << std::endl; - std::cout << " ONNX IR version: " << nvonnxparser::onnx_ir_version_string(onnx::IR_VERSION) << std::endl; -#endif std::cout << " TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." 
<< NV_TENSORRT_PATCH << "." << NV_TENSORRT_BUILD << std::endl; } -inline string getFileType(const string& filepath) +inline std::string getFileType(const std::string& filepath) { return filepath.substr(filepath.find_last_of(".") + 1); } -inline string toLower(const string& inp) +inline std::string toLower(const std::string& inp) { - string out = inp; + std::string out = inp; std::transform(out.begin(), out.end(), out.begin(), ::tolower); return out; } @@ -447,10 +548,10 @@ inline void setAllTensorScales(INetworkDefinition* network, float inScales = 2.0 } } -inline void setDummyInt8Scales(const IBuilder* b, INetworkDefinition* n) +inline void setDummyInt8Scales(const IBuilderConfig* c, INetworkDefinition* n) { // Set dummy tensor scales if Int8 mode is requested. - if (b->getInt8Mode()) + if (c->getFlag(BuilderFlag::kINT8)) { gLogWarning << "Int8 calibrator not provided. Generating dummy per tensor scales. Int8 accuracy is not guaranteed." @@ -459,26 +560,30 @@ inline void setDummyInt8Scales(const IBuilder* b, INetworkDefinition* n) } } -inline void enableDLA(IBuilder* b, int useDLACore, bool allowGPUFallback = true) +inline void enableDLA(IBuilder* builder, IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) { if (useDLACore >= 0) { - if (b->getNbDLACores() == 0) + if (builder->getNbDLACores() == 0) { std::cerr << "Trying to use DLA core " << useDLACore << " on a platform that doesn't have any DLA cores" << std::endl; assert("Error: use DLA core on a platfrom that doesn't have any DLA cores" && false); } - b->allowGPUFallback(allowGPUFallback); - if (!b->getInt8Mode()) + if (allowGPUFallback) + { + config->setFlag(BuilderFlag::kGPU_FALLBACK); + } + if (!builder->getInt8Mode() && !config->getFlag(BuilderFlag::kINT8)) { // User has not requested INT8 Mode. // By default run in FP16 mode. FP32 mode is not permitted. - b->setFp16Mode(true); + builder->setFp16Mode(true); + config->setFlag(BuilderFlag::kFP16); } - b->setDefaultDeviceType(DeviceType::kDLA); - b->setDLACore(useDLACore); - b->setStrictTypeConstraints(true); + config->setDefaultDeviceType(DeviceType::kDLA); + config->setDLACore(useDLACore); + config->setFlag(BuilderFlag::kSTRICT_TYPES); } } @@ -488,7 +593,7 @@ inline int parseDLA(int argc, char** argv) { std::string arg(argv[i]); if (strncmp(argv[i], "--useDLACore=", 13) == 0) - return stoi(argv[i] + 13); + return std::stoi(argv[i] + 13); } return -1; } @@ -523,6 +628,12 @@ inline unsigned int elementSize(DataType t) return 0; } +template +inline A divUp(A x, B n) +{ + return (x + n - 1) / n; +} + template struct PPM { @@ -555,7 +666,7 @@ inline void readPPMFile(const std::string& filename, samplesCommon::PPM infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); } -void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) +inline void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) { ppm.fileName = filename; std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); @@ -609,7 +720,7 @@ inline void writePPMFileWithBBox(const std::string& filename, PPM& ppm, outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); } -void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) +inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) { std::ofstream outfile("./" + filename, std::ofstream::binary); assert(!outfile.fail()); @@ -764,6 +875,23 @@ inline int getW(const Dims& d) return d.nbDims >= 1 ? 
d.d[d.nbDims - 1] : 1; } +inline void loadLibrary(const std::string& path) +{ +#ifdef _MSC_VER + void* handle = LoadLibrary(path.c_str()); +#else + void* handle = dlopen(path.c_str(), RTLD_LAZY); +#endif + if (handle == nullptr) + { +#ifdef _MSC_VER + gLogError << "Could not load plugin library: " << path << std::endl; +#else + gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; +#endif + } +} + } // namespace samplesCommon inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) @@ -776,20 +904,4 @@ inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) return os << ")"; } -inline int loadLibrary(const std::string& libName) -{ - void *dlhandle = nullptr; -#if !defined(_WIN32) - dlhandle = dlopen(libName.c_str(), RTLD_LAZY); -#else - dlhandle = LoadLibrary(libName.c_str()); -#endif - if (!dlhandle) - { - std::cerr << "Error loading library: " << libName << " Error code: " << dlerror() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} - #endif // TENSORRT_COMMON_H diff --git a/samples/common/getOptions.cpp b/samples/common/getOptions.cpp new file mode 100644 index 00000000..e35af11b --- /dev/null +++ b/samples/common/getOptions.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "getOptions.h" +#include "logger.h" + +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! Matching for TRTOptions is defined as follows: +//! +//! If A and B both have longName set, A matches B if and only if A.longName == +//! B.longName and (A.shortName == B.shortName if both have short name set). +//! +//! If A only has shortName set and B only has longName set, then A does not +//! match B. It is assumed that when 2 TRTOptions are compared, one of them is +//! the definition of a TRTOption in the input to getOptions. As such, if the +//! definition only has shortName set, it will never be equal to a TRTOption +//! that does not have shortName set (and same for longName). +//! +//! If A and B both have shortName set but B does not have longName set, A +//! matches B if and only if A.shortName == B.shortName. +//! +//! If A has neither long or short name set, A matches B if and only if B has +//! neither long or short name set. +bool matches(const TRTOption& a, const TRTOption& b) +{ + if (!a.longName.empty() && !b.longName.empty()) + { + if (a.shortName && b.shortName) + { + return (a.longName == b.longName) && (a.shortName == b.shortName); + } + return a.longName == b.longName; + } + + // If only one of them is not set, this will return false anyway. + return a.shortName == b.shortName; +} + +//! getTRTOptionIndex returns the index of a TRTOption in a vector of +//! TRTOptions, -1 if not found. 
+int getTRTOptionIndex(const std::vector& options, const TRTOption& opt) +{ + for (size_t i = 0; i < options.size(); ++i) + { + if (matches(opt, options[i])) + { + return i; + } + } + return -1; +} + +//! validateTRTOption will return a string containing an error message if options +//! contain non-numeric characters, or if there are duplicate option names found. +//! Otherwise, returns the empty string. +std::string validateTRTOption( + const std::set& seenShortNames, const std::set& seenLongNames, const TRTOption& opt) +{ + if (opt.shortName != 0) + { + if (!std::isalnum(opt.shortName)) + { + return "Short name '" + std::to_string(opt.shortName) + "' is non-alphanumeric"; + } + + if (seenShortNames.find(opt.shortName) != seenShortNames.end()) + { + return "Short name '" + std::to_string(opt.shortName) + "' is a duplicate"; + } + } + + if (!opt.longName.empty()) + { + for (const char& c : opt.longName) + { + if (!std::isalnum(c) && c != '-' && c != '_') + { + return "Long name '" + opt.longName + "' contains characters that are not '-', '_', or alphanumeric"; + } + } + + if (seenLongNames.find(opt.longName) != seenLongNames.end()) + { + return "Long name '" + opt.longName + "' is a duplicate"; + } + } + return ""; +} + +//! validateTRTOptions will return a string containing an error message if any +//! options contain non-numeric characters, or if there are duplicate option +//! names found. Otherwise, returns the empty string. +std::string validateTRTOptions(const std::vector& options) +{ + std::set seenShortNames; + std::set seenLongNames; + for (size_t i = 0; i < options.size(); ++i) + { + const std::string errMsg = validateTRTOption(seenShortNames, seenLongNames, options[i]); + if (!errMsg.empty()) + { + return "Error '" + errMsg + "' at TRTOption " + std::to_string(i); + } + + seenShortNames.insert(options[i].shortName); + seenLongNames.insert(options[i].longName); + } + return ""; +} + +//! parseArgs parses an argument list and returns a TRTParsedArgs with the +//! fields set accordingly. Assumes that options is validated. +//! ErrMsg will be set if: +//! - an argument is null +//! - an argument is empty +//! - an argument does not have option (i.e. "-" and "--") +//! - a short argument has more than 1 character +//! - the last argument in the list requires a value +TRTParsedArgs parseArgs(int argc, const char* const* argv, const std::vector& options) +{ + TRTParsedArgs parsedArgs; + parsedArgs.values.resize(options.size()); + + for (int i = 1; i < argc; ++i) // index of current command-line argument + { + if (argv[i] == nullptr) + { + return TRTParsedArgs{"Null argument at index " + std::to_string(i)}; + } + + const std::string argStr(argv[i]); + if (argStr.empty()) + { + return TRTParsedArgs{"Empty argument at index " + std::to_string(i)}; + } + + // No starting hyphen means it is a positional argument + if (argStr[0] != '-') + { + parsedArgs.positionalArgs.push_back(argStr); + continue; + } + + if (argStr == "-" || argStr == "--") + { + return TRTParsedArgs{"Argument does not specify an option at index " + std::to_string(i)}; + } + + // If only 1 hyphen, char after is the flag. 
+ TRTOption opt; + std::string value; + if (argStr[1] != '-') + { + // Must only have 1 char after the hyphen + if (argStr.size() > 2) + { + return TRTParsedArgs{"Short arg contains more than 1 character at index " + std::to_string(i)}; + } + opt.shortName = argStr[1]; + } + else + { + opt.longName = argStr.substr(2); + + // We need to support --foo=bar syntax, so look for '=' + const size_t eqIndex = opt.longName.find('='); + if (eqIndex < opt.longName.size()) + { + value = opt.longName.substr(eqIndex + 1); + opt.longName = opt.longName.substr(0, eqIndex); + } + } + + const int idx = getTRTOptionIndex(options, opt); + if (idx < 0) + { + continue; + } + + if (options[idx].valueRequired) + { + if (!value.empty()) + { + parsedArgs.values[idx].second.push_back(value); + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + continue; + } + + if (i + 1 >= argc) + { + return TRTParsedArgs{"Last argument requires value, but none given"}; + } + + const std::string nextArg(argv[i + 1]); + if (nextArg.size() >= 1 && nextArg[0] == '-') + { + gLogWarning << "Warning: Using '" << nextArg << "' as a value for '" << argStr + << "', Should this be its own flag?" << std::endl; + } + + parsedArgs.values[idx].second.push_back(nextArg); + i += 1; // Next argument already consumed + + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + } + else + { + parsedArgs.values[idx].first += 1; + } + } + return parsedArgs; +} + +TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector& options) +{ + const std::string errMsg = validateTRTOptions(options); + if (!errMsg.empty()) + { + return TRTParsedArgs{errMsg}; + } + return parseArgs(argc, argv, options); +} +} // namespace utility +} // namespace nvinfer1 diff --git a/samples/common/getOptions.h b/samples/common/getOptions.h new file mode 100644 index 00000000..a9edb0de --- /dev/null +++ b/samples/common/getOptions.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_GET_OPTIONS_H +#define TRT_GET_OPTIONS_H + +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! TRTOption defines a command line option. At least 1 of shortName and longName +//! must be defined. +//! If bool initialization is undefined behavior on your system, valueRequired +//! must also be explicitly defined. +//! helpText is optional. +struct TRTOption +{ + char shortName; //!< Option name in short (single hyphen) form (i.e. -a, -b) + std::string longName; //!< Option name in long (double hyphen) form (i.e. --foo, --bar) + bool valueRequired; //!< True if a value is needed for an option (i.e. -N 4, --foo bar) + std::string helpText; //!< Text to show when printing out the command usage +}; + +//! TRTParsedArgs is returned by getOptions after it has parsed a command line +//! argument list (argv). +//! +//! errMsg is a string containing an error message if any errors occurred. If it +//! 
is empty, no errors occurred. +//! +//! values stores a vector of pairs for each option (ordered by order in the +//! input). Each pair contains an int (the number of occurrences) and a vector +//! of strings (a list of values). The user should know which of these to use, +//! and which options required values. For non-value options, only occurrences is +//! populated. For value-required options, occurrences == # of values. Values do +//! not need to be unique. +//! +//! positionalArgs stores additional arguments that are passed in without an +//! option (these must not start with a hyphen). +struct TRTParsedArgs +{ + std::string errMsg; + std::vector<std::pair<int, std::vector<std::string>>> values; + std::vector<std::string> positionalArgs; +}; + +//! Parse the input arguments passed to main() and extract options as well as +//! positional arguments. +//! +//! Options are supposed to be passed to main() with a preceding hyphen '-'. +//! +//! If there is a single preceding hyphen, there should be exactly 1 character +//! after the hyphen, which is interpreted as the option. +//! +//! If there are 2 preceding hyphens, the entire argument (without the hyphens) +//! is interpreted as the option. +//! +//! If the option requires a value, the next argument is used as the value. +//! +//! Positional arguments must not start with a hyphen. +//! +//! If an argument requires a value, the next argument is interpreted as the +//! value, even if it is the form of a valid option (i.e. --foo --bar will store +//! "--bar" as a value for option "foo" if "foo" requires a value). +//! We also support --name=value syntax. In this case, 'value' would be used as +//! the value, NOT the next argument. +//! +//! For options: +//! { { 'a', "", false }, +//! { 'b', "", false }, +//! { 0, "cee", false }, +//! { 'd', "", true }, +//! { 'e', "", true }, +//! { 'f', "foo", true } } +//! +//! ./main hello world -a -a --cee -d 12 -f 34 +//! and +//! ./main hello world -a -a --cee -d 12 --foo 34 +//! +//! will result in: +//! +//! TRTParsedArgs { +//! errMsg: "", +//! values: { { 2, {} }, +//! { 0, {} }, +//! { 1, {} }, +//! { 1, {"12"} }, +//! { 0, {} }, +//! { 1, {"34"} } } +//! positionalArgs: {"hello", "world"}, +//! } +//! +//! Non-POSIX behavior: +//! - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each +//! option must have its own hyphen prefix. +//! - Does not support -e12 as a shorthand for "-e 12". Values MUST be +//! whitespace-separated from the option it is for. +//! +//! @param[in] argc The number of arguments passed to main (including the +//! file name, which is disregarded) +//! @param[in] argv The arguments passed to main (including the file name, +//! which is disregarded) +//! @param[in] options List of TRTOptions to parse +//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of +//! the fields.
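+//!
+//! Illustrative consumption of the result (hypothetical option names, sketched for clarity):
+//!   std::vector<TRTOption> opts{{'h', "help", false, "Show usage"},
+//!                               {0, "engine", true, "Path to a serialized engine"}};
+//!   TRTParsedArgs parsed = getOptions(argc, argv, opts);
+//!   if (!parsed.errMsg.empty()) { /* report the error and exit */ }
+//!   const bool showHelp = parsed.values[0].first > 0;
+//!   const std::string enginePath
+//!       = parsed.values[1].second.empty() ? "" : parsed.values[1].second.back();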
+TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector& options); +} // namespace utility +} // namespace nvinfer1 + +#endif // TRT_GET_OPTIONS_H diff --git a/samples/common/logging.h b/samples/common/logging.h index 32c04ce3..63a0a3a1 100644 --- a/samples/common/logging.h +++ b/samples/common/logging.h @@ -17,8 +17,10 @@ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H -#include "NvInfer.h" +#include "NvInferRuntimeCommon.h" #include +#include +#include #include #include #include @@ -66,6 +68,16 @@ class LogStreamConsumerBuffer : public std::stringbuf { if (mShouldLog) { + // prepend timestamp + std::time_t timestamp = std::time(nullptr); + tm* tm_local = std::localtime(×tamp); + std::cout << "["; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mon << "/"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; + std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); diff --git a/samples/common/sampleConfig.h b/samples/common/sampleConfig.h index 94fe028b..a5b29c13 100644 --- a/samples/common/sampleConfig.h +++ b/samples/common/sampleConfig.h @@ -41,33 +41,22 @@ class SampleConfig : public nvonnxparser::IOnnxConfig std::string mReferenceFilename; std::string mOutputFilename; std::string mCalibrationFilename; - int64_t mMaxBatchSize; - int64_t mMaxWorkspaceSize; - int64_t mCalibBatchSize; - int64_t mMaxNCalibBatch; - int64_t mFirstCalibBatch; - int64_t mUseDLACore; - nvinfer1::DataType mModelDtype; - Verbosity mVerbosity; - bool mPrintLayercInfo; - bool mDebugBuilder; - InputDataFormat mInputDataFormat; - uint64_t mTopK; + int64_t mMaxBatchSize{32}; + int64_t mMaxWorkspaceSize{1 * 1024 * 1024 * 1024}; + int64_t mCalibBatchSize{0}; + int64_t mMaxNCalibBatch{0}; + int64_t mFirstCalibBatch{0}; + int64_t mUseDLACore{-1}; + nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT}; + Verbosity mVerbosity{static_cast(nvinfer1::ILogger::Severity::kWARNING)}; + bool mPrintLayercInfo{false}; + bool mDebugBuilder{false}; + InputDataFormat mInputDataFormat{InputDataFormat::kASCII}; + uint64_t mTopK{0}; + float mFailurePercentage{-1.0f}; public: SampleConfig() - : mMaxBatchSize(32) - , mMaxWorkspaceSize(1 * 1024 * 1024 * 1024) - , mCalibBatchSize(0) - , mMaxNCalibBatch(0) - , mFirstCalibBatch(0) - , mUseDLACore(-1) - , mModelDtype(nvinfer1::DataType::kFLOAT) - , mVerbosity(static_cast(nvinfer1::ILogger::Severity::kWARNING)) - , mPrintLayercInfo(false) - , mDebugBuilder(false) - , mInputDataFormat(InputDataFormat::kASCII) - , mTopK(0) { #ifdef ONNX_DEBUG if (isDebug()) @@ -220,10 +209,10 @@ class SampleConfig : public nvonnxparser::IOnnxConfig return mDebugBuilder; } //!< get the boolean variable, corresponding to the debug builder - const char* getImageFileName() const + const char* getImageFileName() const //!< set Image file name (PPM or ASCII) { return mImageFilename.c_str(); - } //!< set Image file name (PPM or ASCII) + } void setImageFileName(const char* imageFilename) //!< get the Image file name { mImageFilename = string(imageFilename); @@ -246,10 +235,10 @@ class SampleConfig : public 
nvonnxparser::IOnnxConfig return mInputDataFormat; } //!< returns the expected data format of the image file. - const char* getOutputFileName() const + const char* getOutputFileName() const //!< specifies the file to save the results { return mOutputFilename.c_str(); - } //!< specifies the file to save the results + } void setOutputFileName(const char* outputFilename) //!< get the output file name { mOutputFilename = string(outputFilename); @@ -273,6 +262,16 @@ class SampleConfig : public nvonnxparser::IOnnxConfig mTopK = topK; } //!< If this options is specified, return the K top probabilities. + float getFailurePercentage() const + { + return mFailurePercentage; + } + + void setFailurePercentage(float f) + { + mFailurePercentage = f; + } + bool isDebug() const { #if ONNX_DEBUG diff --git a/samples/common/sampleEngines.cpp b/samples/common/sampleEngines.cpp new file mode 100644 index 00000000..a0ad70a3 --- /dev/null +++ b/samples/common/sampleEngines.cpp @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "NvUffParser.h" + +#include "logger.h" +#include "sampleOptions.h" +#include "sampleEngines.h" +#include "sampleUtils.h" + +using namespace nvinfer1; + +namespace sample +{ + +namespace +{ + +struct CaffeBufferShutter +{ + ~CaffeBufferShutter() + { + nvcaffeparser1::shutdownProtobufLibrary(); + } +}; + +struct UffBufferShutter +{ + ~UffBufferShutter() + { + nvuffparser::shutdownProtobufLibrary(); + } +}; + +} // namespace + +Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err) +{ + Parser parser; + const std::string& modelName = model.baseModel.model; + switch (model.baseModel.format) + { + case ModelFormat::kCAFFE: + { + using namespace nvcaffeparser1; + parser.caffeParser.reset(createCaffeParser()); + CaffeBufferShutter bufferShutter; + const auto blobNameToTensor = parser.caffeParser->parse( + model.prototxt.c_str(), modelName.empty() ? nullptr : modelName.c_str(), network, DataType::kFLOAT); + if (!blobNameToTensor) + { + err << "Failed to parse caffe model or prototxt, tensors blob not found" << std::endl; + parser.caffeParser.reset(); + break; + } + + for (const auto& s : model.outputs) + { + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + err << "Could not find output blob " << s << std::endl; + parser.caffeParser.reset(); + break; + } + network.markOutput(*blobNameToTensor->find(s.c_str())); + } + break; + } + case ModelFormat::kUFF: + { + using namespace nvuffparser; + parser.uffParser.reset(createUffParser()); + UffBufferShutter bufferShutter; + for (const auto& s : model.uffInputs.inputs) + { + if (!parser.uffParser->registerInput( + s.first.c_str(), s.second, model.uffInputs.NHWC ? 
UffInputOrder::kNHWC : UffInputOrder::kNCHW)) + { + err << "Failed to register input " << s.first << std::endl; + parser.uffParser.reset(); + break; + } + } + + for (const auto& s : model.outputs) + { + if (!parser.uffParser->registerOutput(s.c_str())) + { + err << "Failed to register output " << s << std::endl; + parser.uffParser.reset(); + break; + } + } + + if (!parser.uffParser->parse(model.baseModel.model.c_str(), network)) + { + err << "Failed to parse uff file" << std::endl; + parser.uffParser.reset(); + break; + } + break; + } + case ModelFormat::kONNX: + { + using namespace nvonnxparser; + parser.onnxParser.reset(createParser(network, gLogger.getTRTLogger())); + if (!parser.onnxParser->parseFromFile( + model.baseModel.model.c_str(), static_cast(gLogger.getReportableSeverity()))) + { + err << "Failed to parse onnx file" << std::endl; + parser.onnxParser.reset(); + } + break; + } + case ModelFormat::kANY: break; + } + + return parser; +} + +namespace +{ + +class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + RndInt8Calibrator( + int batches, const std::string& cacheFile, const nvinfer1::INetworkDefinition& network, std::ostream& err); + + ~RndInt8Calibrator() + { + for (auto& elem : mInputDeviceBuffers) + { + cudaCheck(cudaFree(elem.second), mErr); + } + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) override; + + int getBatchSize() const override + { + return 1; + } + + const void* readCalibrationCache(size_t& length) override; + + virtual void writeCalibrationCache(const void*, size_t) override {} + +private: + int mBatches{}; + int mCurrentBatch{}; + std::string mCacheFile; + std::map mInputDeviceBuffers; + std::vector mCalibrationCache; + std::ostream& mErr; +}; + +RndInt8Calibrator::RndInt8Calibrator( + int batches, const std::string& cacheFile, const INetworkDefinition& network, std::ostream& err) + : mBatches(batches) + , mCurrentBatch(0) + , mCacheFile(cacheFile) + , mErr(err) +{ + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + auto gen = [&generator, &distribution]() { return distribution(generator); }; + + for (int i = 0; i < network.getNbInputs(); i++) + { + auto input = network.getInput(i); + int elemCount = volume(input->getDimensions()); + std::vector rnd_data(elemCount); + std::generate_n(rnd_data.begin(), elemCount, gen); + + void* data; + cudaCheck(cudaMalloc(&data, elemCount * sizeof(float)), mErr); + cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount * sizeof(float), cudaMemcpyHostToDevice), mErr); + + mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); + } +} + +bool RndInt8Calibrator::getBatch(void* bindings[], const char* names[], int nbBindings) +{ + if (mCurrentBatch >= mBatches) + { + return false; + } + + for (int i = 0; i < nbBindings; ++i) + { + bindings[i] = mInputDeviceBuffers[names[i]]; + } + + ++mCurrentBatch; + + return true; +} + +const void* RndInt8Calibrator::readCalibrationCache(size_t& length) +{ + mCalibrationCache.clear(); + std::ifstream input(mCacheFile, std::ios::binary); + input >> std::noskipws; + if (input.good()) + { + std::copy( + std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); + } + + return mCalibrationCache.size() ? mCalibrationCache.data() : nullptr; +} + +void setTensorScales(const INetworkDefinition& network, float inScales = 2.0f, float outScales = 4.0f) +{ + // Ensure that all layer inputs have a scale. 
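+ // The loop below walks every layer and assigns placeholder dynamic ranges: inputs
+ // without a range get [-inScales, inScales], and outputs get [-outScales, outScales],
+ // except pooling outputs, which must match their input scale. This exists so INT8
+ // builds can proceed without a calibration cache; it is intended for performance
+ // experiments rather than meaningful accuracy.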
+ for (int l = 0; l < network.getNbLayers(); l++) + { + auto layer = network.getLayer(l); + for (int i = 0; i < layer->getNbInputs(); i++) + { + ITensor* input{layer->getInput(i)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input && !input->dynamicRangeIsSet()) + { + input->setDynamicRange(-inScales, inScales); + } + } + for (int o = 0; o < layer->getNbOutputs(); o++) + { + ITensor* output{layer->getOutput(o)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output && !output->dynamicRangeIsSet()) + { + // Pooling must have the same input and output scales. + if (layer->getType() == LayerType::kPOOLING) + { + output->setDynamicRange(-inScales, inScales); + } + else + { + output->setDynamicRange(-outScales, outScales); + } + } + } + } +} + +} // namespace + +ICudaEngine* networkToEngine(const BuildOptions& build, const SystemOptions& sys, IBuilder& builder, + INetworkDefinition& network, std::ostream& err) +{ + unique_ptr config{builder.createBuilderConfig()}; + + IOptimizationProfile* profile{nullptr}; + if (build.maxBatch) + { + builder.setMaxBatchSize(build.maxBatch); + } + else + { + if (!build.shapes.empty()) + { + profile = builder.createOptimizationProfile(); + } + } + + for (unsigned int i = 0, n = network.getNbInputs(); i < n; i++) + { + // Set formats and data types of inputs + auto input = network.getInput(i); + if (!build.inputFormats.empty()) + { + input->setType(build.inputFormats[i].first); + input->setAllowedFormats(build.inputFormats[i].second); + } + else + { + input->setType(DataType::kFLOAT); + input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + + if (profile) + { + Dims dims = input->getDimensions(); + if (std::any_of(dims.d + 1, dims.d + dims.nbDims, [](int dim) { return dim == -1; })) + { + err << "Only dynamic batch dimension is currently supported, other dimensions must be static" + << std::endl; + return nullptr; + } + dims.d[0] = -1; + Dims profileDims = dims; + auto shape = build.shapes.find(input->getName()); + if (shape == build.shapes.end()) + { + err << "Dynamic dimensions required for input " << input->getName() << std::endl; + return nullptr; + } + profileDims.d[0] = shape->second[static_cast(OptProfileSelector::kMIN)].d[0]; + profile->setDimensions(input->getName(), OptProfileSelector::kMIN, profileDims); + profileDims.d[0] = shape->second[static_cast(OptProfileSelector::kOPT)].d[0]; + profile->setDimensions(input->getName(), OptProfileSelector::kOPT, profileDims); + profileDims.d[0] = shape->second[static_cast(OptProfileSelector::kMAX)].d[0]; + profile->setDimensions(input->getName(), OptProfileSelector::kMAX, profileDims); + + input->setDimensions(dims); + } + } + + if (profile) + { + if (!profile->isValid()) + { + err << "Required optimization profile is invalid" << std::endl; + return nullptr; + } + config->addOptimizationProfile(profile); + } + + for (unsigned int i = 0, n = network.getNbOutputs(); i < n; i++) // BUILD->NETWORK + { + // Set formats and data types of outputs + auto output = network.getOutput(i); + if (!build.outputFormats.empty()) + { + output->setType(build.outputFormats[i].first); + output->setAllowedFormats(build.outputFormats[i].second); + } + else + { + output->setType(DataType::kFLOAT); + output->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + } + + config->setMaxWorkspaceSize(static_cast(build.workspace) << 20); + + if (build.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + + if (build.int8) + { + config->setFlag(BuilderFlag::kINT8); + 
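+ // Calibration is handled just below: if no cache file was given, or INT8 I/O
+ // formats are requested, placeholder scales are set via setTensorScales();
+ // otherwise a RndInt8Calibrator is installed, which feeds random inputs and
+ // reuses any scales found in the supplied cache file. A real application would
+ // typically supply its own IInt8Calibrator instead.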
} + + auto isInt8 = [](const IOFormat& format) { return format.first == DataType::kINT8; }; + auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); + + if ((build.int8 && build.calibration.empty()) || int8IO) + { + // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, + // because auto calibration does not support this case. + setTensorScales(network); + } + else if (build.int8) + { + config->setInt8Calibrator(new RndInt8Calibrator(1, build.calibration, network, err)); + } + + if (build.safe) + { + config->setEngineCapability(sys.DLACore != -1 ? EngineCapability::kSAFE_DLA : EngineCapability::kSAFE_GPU); + } + + if (sys.DLACore != -1) + { + if (sys.DLACore < builder.getNbDLACores()) + { + config->setDefaultDeviceType(DeviceType::kDLA); + config->setDLACore(sys.DLACore); + config->setFlag(BuilderFlag::kSTRICT_TYPES); + + if (sys.fallback) + { + config->setFlag(BuilderFlag::kGPU_FALLBACK); + } + if (!build.int8) + { + config->setFlag(BuilderFlag::kFP16); + } + } + else + { + err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl; + return nullptr; + } + } + + return builder.buildEngineWithConfig(network, *config); +} + +ICudaEngine* modelToEngine( + const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err) +{ + unique_ptr builder{createInferBuilder(gLogger.getTRTLogger())}; + if (builder == nullptr) + { + err << "Builder creation failed" << std::endl; + return nullptr; + } + auto batchFlag = (build.maxBatch ? 0U : 1U) + << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + unique_ptr network{builder->createNetworkV2(batchFlag)}; + if (!network) + { + err << "Network creation failed" << std::endl; + return nullptr; + } + Parser parser = modelToNetwork(model, *network, err); + if (!parser) + { + err << "Parsing model failed" << std::endl; + return nullptr; + } + + return networkToEngine(build, sys, *builder, *network, err); +} + +ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err) +{ + std::ifstream engineFile(engine, std::ios::binary); + if (!engineFile) + { + err << "Error opening engine file: " << engine << std::endl; + return nullptr; + } + + engineFile.seekg(0, engineFile.end); + long int fsize = engineFile.tellg(); + engineFile.seekg(0, engineFile.beg); + + std::vector engineData(fsize); + engineFile.read(engineData.data(), fsize); + if (!engineFile) + { + err << "Error loading engine file: " << engine << std::endl; + return nullptr; + } + + unique_ptr runtime{createInferRuntime(gLogger.getTRTLogger())}; + if (DLACore != -1) + { + runtime->setDLACore(DLACore); + } + + return runtime->deserializeCudaEngine(engineData.data(), fsize, nullptr); +} + +bool saveEngine(const ICudaEngine& engine, const std::string& fileName, std::ostream& err) +{ + std::ofstream engineFile(fileName, std::ios::binary); + if (!engineFile) + { + err << "Cannot open engine file: " << fileName << std::endl; + return false; + } + + unique_ptr serializedEngine{engine.serialize()}; + if (serializedEngine == nullptr) + { + err << "Engine serialization failed" << std::endl; + return false; + } + + engineFile.write(static_cast(serializedEngine->data()), serializedEngine->size()); + return !engineFile.fail(); +} + +} // namespace sample diff --git a/samples/common/sampleEngines.h b/samples/common/sampleEngines.h new file mode 100644 index 
00000000..9d3e3804 --- /dev/null +++ b/samples/common/sampleEngines.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENGINES_H +#define TRT_SAMPLE_ENGINES_H + +#include + +#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "NvUffParser.h" + +#include "sampleUtils.h" + +namespace sample +{ + +struct Parser +{ + unique_ptr caffeParser; + unique_ptr uffParser; + unique_ptr onnxParser; + + operator bool() const + { + return caffeParser || uffParser || onnxParser; + } +}; + +//! +//! \brief Generate a network definition for a given model +//! +//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid +//! parser (the returned parser converts to false if tested) +//! +//! \see Parser::operator bool() +//! +Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err); + +//! +//! \brief Create an engine for a network defintion +//! +//! \return Pointer to the engine created or nullptr if the creation failed +//! +nvinfer1::ICudaEngine* networkToEngine(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, + nvinfer1::INetworkDefinition& network, std::ostream& err); + +//! +//! \brief Create an engine for a given model +//! +//! \return Pointer to the engine created or nullptr if the creation failed +//! +nvinfer1::ICudaEngine* modelToEngine( + const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); + +//! +//! \brief Load a serialized engine +//! +//! \return Pointer to the engine loaded or nullptr if the operation failed +//! +nvinfer1::ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err); + +//! +//! \brief Save an engine into a file +//! +//! \return boolean Return true if the engine was successfully saved +//! +bool saveEngine(const nvinfer1::ICudaEngine& engine, const std::string& fileName, std::ostream& err); + +} // namespace sample + +#endif // TRT_SAMPLE_ENGINES_H diff --git a/samples/common/sampleOptions.cpp b/samples/common/sampleOptions.cpp new file mode 100644 index 00000000..f83d5362 --- /dev/null +++ b/samples/common/sampleOptions.cpp @@ -0,0 +1,867 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +#include "sampleOptions.h" + +namespace sample +{ + +Arguments argsToArgumentsMap(int argc, char* argv[]) +{ + Arguments arguments; + for (int i = 1; i < argc; ++i) + { + auto valuePtr = strchr(argv[i], '='); + if (valuePtr) + { + std::string value{valuePtr + 1}; + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + } + else + { + arguments.emplace(argv[i], ""); + } + } + return arguments; +} + +void BaseModelOptions::parse(Arguments& arguments) +{ + if (checkEraseOption(arguments, "--onnx", model)) + { + format = ModelFormat::kONNX; + } + else if (checkEraseOption(arguments, "--uff", model)) + { + format = ModelFormat::kUFF; + } + else if (checkEraseOption(arguments, "--model", model)) + { + format = ModelFormat::kCAFFE; + } +} + +void UffInput::parse(Arguments& arguments) +{ + checkEraseOption(arguments, "--uffNHWC", NHWC); + std::vector args; + if (checkEraseRepeatedOption(arguments, "--uffInput", args)) + { + for (const auto& i : args) + { + std::vector values{splitToStringVec(i, ',')}; + if (values.size() == 4) + { + nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; + inputs.emplace_back(values[0], dims); + } + else + { + throw std::invalid_argument(std::string("Invalid uffInput ") + i); + } + } + } +} + +void ModelOptions::parse(Arguments& arguments) +{ + baseModel.parse(arguments); + + switch (baseModel.format) + { + case ModelFormat::kCAFFE: + { + checkEraseOption(arguments, "--deploy", prototxt); + break; + } + case ModelFormat::kUFF: + { + uffInputs.parse(arguments); + if (uffInputs.inputs.empty()) + { + throw std::invalid_argument("Uff models require at least one input"); + } + break; + } + case ModelFormat::kONNX: break; + case ModelFormat::kANY: + { + if (checkEraseOption(arguments, "--deploy", prototxt)) + { + baseModel.format = ModelFormat::kCAFFE; + } + break; + } + } + if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) + { + std::vector outArgs; + if (checkEraseRepeatedOption(arguments, "--output", outArgs)) + { + for (const auto& o : outArgs) + { + for (auto& v : splitToStringVec(o, ',')) + { + outputs.emplace_back(std::move(v)); + } + } + } + if (outputs.empty()) + { + throw std::invalid_argument("Caffe and Uff models require at least one output"); + } + } +} + +namespace +{ + +void insertShapes( + std::unordered_map& shapes, const std::string& name, const nvinfer1::Dims& dims) +{ + std::pair profile; + profile.first = name; + profile.second[static_cast(nvinfer1::OptProfileSelector::kMIN)] = dims; + profile.second[static_cast(nvinfer1::OptProfileSelector::kOPT)] = dims; + profile.second[static_cast(nvinfer1::OptProfileSelector::kMAX)] = dims; + shapes.insert(profile); +} + +} // namespace + +void BuildOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + checkEraseOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + + auto getShapes = [&arguments](std::unordered_map& shapes, const char* argument, + nvinfer1::OptProfileSelector selector) { + std::string list; + checkEraseOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for 
(const auto& s : shapeList) + { + std::vector nameRange{splitToStringVec(s, ':')}; + if (shapes.find(nameRange[0]) == shapes.end()) + { + auto dims = stringToValue(nameRange[1]); + insertShapes(shapes, nameRange[0], dims); + } + else + { + shapes[nameRange[0]][static_cast(selector)] = stringToValue(nameRange[1]); + } + } + }; + + bool explicitBatch{false}; + checkEraseOption(arguments, "--explicitBatch", explicitBatch); + getShapes(shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + getShapes(shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + getShapes(shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + explicitBatch = explicitBatch || !shapes.empty(); + + int batch{0}; + checkEraseOption(arguments, "--maxBatch", batch); + if (explicitBatch && batch) + { + throw std::invalid_argument( + "Explicit batch or dynamic shapes enabled with implicit maxBatch " + std::to_string(batch)); + } + + if (explicitBatch) + { + maxBatch = 0; + } + else + { + if (batch) + { + maxBatch = batch; + } + } + + checkEraseOption(arguments, "--workspace", workspace); + checkEraseOption(arguments, "--minTiming", minTiming); + checkEraseOption(arguments, "--avgTiming", avgTiming); + checkEraseOption(arguments, "--fp16", fp16); + checkEraseOption(arguments, "--int8", int8); + checkEraseOption(arguments, "--safe", safe); + checkEraseOption(arguments, "--calib", calibration); + if (checkEraseOption(arguments, "--loadEngine", engine)) + { + load = true; + } + if (checkEraseOption(arguments, "--saveEngine", engine)) + { + save = true; + } + if (load && save) + { + throw std::invalid_argument("Incompatible load and save engine options selected"); + } +} + +void SystemOptions::parse(Arguments& arguments) +{ + checkEraseOption(arguments, "--device", device); + checkEraseOption(arguments, "--useDLACore", DLACore); + checkEraseOption(arguments, "--allowGPUFallback", fallback); + std::string pluginName; + while (checkEraseOption(arguments, "--plugins", pluginName)) + { + plugins.emplace_back(pluginName); + } +} + +void InferenceOptions::parse(Arguments& arguments) +{ + checkEraseOption(arguments, "--streams", streams); + checkEraseOption(arguments, "--iterations", iterations); + checkEraseOption(arguments, "--duration", duration); + checkEraseOption(arguments, "--warmUp", warmup); + checkEraseOption(arguments, "--threads", threads); + checkEraseOption(arguments, "--useCudaGraph", graph); + checkEraseOption(arguments, "--buildOnly", skip); + + std::string list; + checkEraseOption(arguments, "--shapes", list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + std::vector shapeSpec{splitToStringVec(s, ':')}; + shapes.insert({shapeSpec[0], stringToValue(shapeSpec[1])}); + } + + int batchOpt{0}; + checkEraseOption(arguments, "--batch", batchOpt); + if (!shapes.empty() && batchOpt) + { + throw std::invalid_argument( + "Explicit batch or dynamic shapes enabled with implicit batch " + std::to_string(batchOpt)); + } + if (batchOpt) + { + batch = batchOpt; + } + else + { + if (!shapes.empty()) + { + batch = 0; + } + } +} + +void ReportingOptions::parse(Arguments& arguments) +{ + checkEraseOption(arguments, "--percentile", percentile); + checkEraseOption(arguments, "--avgRuns", avgs); + checkEraseOption(arguments, "--verbose", verbose); + checkEraseOption(arguments, "--dumpOutput", output); + checkEraseOption(arguments, "--dumpProfile", profile); + checkEraseOption(arguments, "--exportTimes", exportTimes); + checkEraseOption(arguments, "--exportProfile", 
exportProfile); + if (percentile < 0 || percentile > 100) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } +} + +bool parseHelp(Arguments& arguments) +{ + bool help{false}; + checkEraseOption(arguments, "--help", help); + return help; +} + +void AllOptions::parse(Arguments& arguments) +{ + model.parse(arguments); + build.parse(arguments); + system.parse(arguments); + inference.parse(arguments); + + if ((!build.maxBatch && inference.batch && inference.batch != defaultBatch) + || (build.maxBatch && build.maxBatch != defaultMaxBatch && !inference.batch)) + { + // If either has selected implict batch and the other has selected explicit batch + throw std::invalid_argument("Conflicting build and inference batch settings"); + } + + if (build.shapes.empty() && !inference.shapes.empty()) + { + for (auto& s : inference.shapes) + { + insertShapes(build.shapes, s.first, s.second); + } + build.maxBatch = 0; + } + else + { + if (!build.shapes.empty() && inference.shapes.empty()) + { + for (auto& s : build.shapes) + { + inference.shapes.insert({s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]}); + } + } + if (!build.maxBatch) + { + inference.batch = 0; + } + } + + if (build.maxBatch && inference.batch) + { + // For implicit batch, check for compatibility and if --maxBatch is not given and inference batch is greater + // than maxBatch, use inference batch also for maxBatch + if (build.maxBatch != defaultMaxBatch && build.maxBatch < inference.batch) + { + throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) + + " is less than inference batch " + std::to_string(inference.batch)); + } + else + { + if (build.maxBatch < inference.batch) + { + build.maxBatch = inference.batch; + } + } + } + + reporting.parse(arguments); + helps = parseHelp(arguments); + + if (!helps) + { + if (!build.load && model.baseModel.format == ModelFormat::kANY) + { + throw std::invalid_argument("Model missing or format not recognized"); + } + if (!build.load && !build.maxBatch && model.baseModel.format != ModelFormat::kONNX) + { + throw std::invalid_argument("Explicit batch size not supported for Caffe and Uff models"); + } + if (build.safe && system.DLACore >= 0) + { + auto checkSafeDLAFormats = [](const std::vector& fmt) { + return fmt.empty() ? 
false : std::all_of(fmt.begin(), fmt.end(), [](const IOFormat& pair) { + bool supported{false}; + supported |= pair.first == nvinfer1::DataType::kINT8 + && pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF + && pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW16); + return supported; + }); + }; + if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.inputFormats)) + { + throw std::invalid_argument( + "I/O formats for safe DLA capability are restricted to fp16:chw16 or int8:chw32"); + } + if (system.fallback) + { + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + } + } + } +} + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) +{ + os << "=== Model Options ===" << std::endl; + + os << "Format: "; + switch (options.format) + { + case ModelFormat::kCAFFE: + { + os << "Caffe"; + break; + } + case ModelFormat::kONNX: + { + os << "ONNX"; + break; + } + case ModelFormat::kUFF: + { + os << "UFF"; + break; + } + case ModelFormat::kANY: os << "*"; break; + } + os << std::endl << "Model: " << options.model << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const UffInput& input) +{ + os << "Uff Inputs Layout: " << (input.NHWC ? "NHWC" : "NCHW") << std::endl; + for (const auto& i : input.inputs) + { + os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options) +{ + os << options.baseModel; + switch (options.baseModel.format) + { + case ModelFormat::kCAFFE: + { + os << "Prototxt: " << options.prototxt; + break; + } + case ModelFormat::kUFF: + { + os << options.uffInputs; + break; + } + case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case + case ModelFormat::kANY: break; + } + + os << "Output:"; + for (const auto& o : options.outputs) + { + os << " " << o; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const IOFormat& format) +{ + switch (format.first) + { + case nvinfer1::DataType::kFLOAT: + { + os << "fp32:"; + break; + } + case nvinfer1::DataType::kHALF: + { + os << "fp16:"; + break; + } + case nvinfer1::DataType::kINT8: + { + os << "int8:"; + break; + } + case nvinfer1::DataType::kINT32: + { + os << "int32:"; + break; + } + } + + for (int f = 0; f < nvinfer1::EnumMax(); ++f) + { + if ((1U << f) & format.second) + { + if (f) + { + os << "+"; + } + switch (nvinfer1::TensorFormat(f)) + { + case nvinfer1::TensorFormat::kLINEAR: + { + os << "chw"; + break; + } + case nvinfer1::TensorFormat::kCHW2: + { + os << "chw2"; + break; + } + case nvinfer1::TensorFormat::kHWC8: + { + os << "hwc8"; + break; + } + case nvinfer1::TensorFormat::kCHW4: + { + os << "chw4"; + break; + } + case nvinfer1::TensorFormat::kCHW16: + { + os << "chw16"; + break; + } + case nvinfer1::TensorFormat::kCHW32: + { + os << "chw32"; + break; + } + } + } + } + return os; +}; + +std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + for (int i = 0; i < dims.nbDims; ++i) + { + os << (i ? "x" : "") << dims.d[i]; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) +{ + int i = 0; + for (const auto& d : dims) + { + if (!d.nbDims) + { + break; + } + os << (i ? 
"+" : "") << d; + ++i; + } + return os; +} + +namespace +{ + +template +void printShapes(std::ostream& os, const char* phase, const T& shapes) +{ + if (shapes.empty()) + { + os << "Input " << phase << " shapes: model" << std::endl; + } + else + { + for (const auto& s : shapes) + { + os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + } + } +} + +std::ostream& printBatch(std::ostream& os, int maxBatch) +{ + if (maxBatch) + { + os << maxBatch; + } + else + { + os << "explicit"; + } + return os; +} + +} // namespace + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options) +{ + // clang-format off + os << "=== Build Options ===" << std::endl << + + "Max batch: "; printBatch(os, options.maxBatch) << std::endl << + "Workspace: " << options.workspace << " MB" << std::endl << + "minTiming: " << options.minTiming << std::endl << + "avgTiming: " << options.avgTiming << std::endl << + "Precision: " << (options.fp16 ? "FP16" : (options.int8 ? "INT8" : "FP32")) << std::endl << + "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << + "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "Save engine: " << (options.save ? options.engine : "") << std::endl << + "Load engine: " << (options.load ? options.engine : "") << std::endl; + // clang-format on + + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for (const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + printIOFormats(os, "Input", options.inputFormats); + printIOFormats(os, "Output", options.outputFormats); + printShapes(os, "build", options.shapes); + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + + "Device: " << options.device << std::endl << + "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << + (options.DLACore != -1 && options.fallback ? 
"(With GPU fallback)" : "") << std::endl; + // clang-format on + os << "Plugins:"; + for (const auto p : options.plugins) + { + os << " " << p; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + + "Batch: "; + if (options.batch && options.shapes.empty()) + { + os << options.batch << std::endl; + } + else + { + os << "Explicit" << std::endl; + } + os << "Iterations: " << options.iterations << " (" << options.warmup << + " ms warm up)" << std::endl << + "Duration: " << options.duration << "s" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Streams: " << options.streams << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Skip inference: " << boolToEnabled(options.skip) << std::endl; + // clang-format on + if (options.batch) + { + printShapes(os, "inference", options.shapes); + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) +{ + // clang-format off + os << "=== Reporting Options ===" << std::endl << + + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentile: " << options.percentile << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on + + return os; +} + +std::ostream& operator<<(std::ostream& os, const AllOptions& options) +{ + os << options.model << options.build << options.system << options.inference << options.reporting << std::endl; + return os; +} + +void BaseModelOptions::help(std::ostream& os) +{ + // clang-format off + os << " --uff= UFF model" << std::endl << + " --onnx= ONNX model" << std::endl << + " --model= Caffe model (default = no model, random weights used)" << std::endl; + // clang-format on +} + +void UffInput::help(std::ostream& os) +{ + // clang-format off + os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " + "multiple times; at least one is required for UFF models" << std::endl << + " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << + "X,Y,Z=H,W,C order in --uffInput)" << std::endl; + // clang-format on +} + +void ModelOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Model Options ===" << std::endl; + BaseModelOptions::help(os); + os << " --deploy= Caffe prototxt file" << std::endl << + " --output=[,]* Output names (it can be specified multiple times); at least one output " + "is required for UFF and Caffe" << std::endl; + UffInput::help(os); + // clang-format on +} + +void BuildOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Build Options ===" << std::endl << + + " --maxBatch Set max batch size and build an implicit batch engine (default = " << defaultMaxBatch << ")" << std::endl << + " --explicitBatch Use explicit batch sizes when building the engine (default = implicit)" << std::endl << + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" << std::endl << + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" << 
std::endl << + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" << std::endl << + " Note: if any of min/max/opt is missing, the profile will be completed using the shapes " << std::endl << + " provided and assuming that opt will be equal to max unless they are both specified;" << std::endl << + " partially specified shapes are applied starting from the batch size;" << std::endl << + " dynamic shapes imply explicit batch" << std::endl << + " Input shapes spec ::= Ishp[\",\"spec]" << std::endl << + " Ishp ::= name\":\"shape" << std::endl << + " shape ::= N[[\"x\"N]*\"*\"]" << std::endl << + " --inputIOFormats=spec Type and formats of the input tensors (default = all inputs in fp32:chw)" << std::endl << + " --outputIOFormats=spec Type and formats of the output tensors (default = all outputs in fp32:chw)" << std::endl << + " IO Formats: spec ::= IOfmt[\",\"spec]" << std::endl << + " IOfmt ::= type:fmt" << std::endl << + " type ::= \"fp32\"|\"fp16\"|\"int32\"|\"int8\"" << std::endl << + " fmt ::= (\"chw\"|\"chw2\"|\"chw4\"|\"hwc8\"|\"chw16\"|\"chw32\")[\"+\"fmt]" << std::endl << + " --workspace=N Set workspace size in megabytes (default = " << defaultWorkspace << ")" << std::endl << + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " + << defaultMinTiming << ")" << std::endl << + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" << std::endl << + " --fp16 Enable fp16 mode (default = disabled)" << std::endl << + " --int8 Run in int8 mode (default = disabled)" << std::endl << + " --calib= Read INT8 calibration cache file" << std::endl << + " --safe Only test the functionality available in safety restricted flows" << std::endl << + " --saveEngine= Save the serialized engine" << std::endl << + " --loadEngine= Load a serialized engine" << std::endl; + // clang-format on +} + +void SystemOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << + " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " + "(default = disabled)" << std::endl; + os << " --plugins Plugin library (.so) to load (can be specified multiple times)" << std::endl; + // clang-format on +} + +void InferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " --shapes=spec Set input shapes for explicit batch and dynamic shapes inputs" << std::endl << + " Input shapes spec ::= Ishp[\",\"spec]" << std::endl << + " Ishp ::= name\":\"shape" << std::endl << + " shape ::= N[[\"x\"N]*\"*\"]" << std::endl << + " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << + " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " + << defaultWarmUp << ")" << std::endl << + " --duration=N Run performance measurements for at least N seconds wallclock time (default = " + << defaultDuration << ")" << std::endl << + " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " + "(default = " << defaultSleep << ")" << std::endl << + " --streams=N 
Instantiate N engines to use concurrently (default = " << defaultStreams << ")" << std::endl << + " --threads Enable multithreading to drive engines with independent threads (default = disabled)" << std::endl << + " --useCudaGraph Use cuda graph to capture engine execution and then launch inference (default = false)" << std::endl << + " --buildOnly Skip inference perf measurement (default = disabled)" << std::endl; + // clang-format on +} + +void ReportingOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Reporting Options ===" << std::endl << + " --verbose Use verbose logging (default = false)" << std::endl << + " --avgRuns=N Report performance measurements averaged over N consecutive " + "iterations (default = " << defaultAvgRuns << ")" << std::endl << + " --percentile=P Report performance for the P percentage (0<=P<=100, 0 " + "representing max perf, and 100 representing min perf; (default" + " = " << defaultPercentile << "%)" << std::endl << + " --dumpOutput Print the output tensor(s) of the last inference iteration " + "(default = disabled)" << std::endl << + " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << + " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << + " --exportProfile= Write the profile information per layer in a json file " + "(default = disabled)" << std::endl; + // clang-format on +} + +void helpHelp(std::ostream& os) +{ + os << "=== Help ===" << std::endl << " --help Print this message" << std::endl; +} + +void AllOptions::help(std::ostream& os) +{ + ModelOptions::help(os); + os << std::endl; + BuildOptions::help(os); + os << std::endl; + InferenceOptions::help(os); + os << std::endl; + // clang-format off + os << "=== Build and Inference Batch Options ===" << std::endl << + " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << + " is set to the inference batch size;" << std::endl << + " when using explicit batch, if shapes are specified only for inference, they " << std::endl << + " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << + " specified only for the build, the opt shapes will be used also for inference;" << std::endl << + " if both are specified, they must be compatible; and if explicit batch is " << std::endl << + " enabled but neither is specified, the model must provide complete static" << std::endl << + " dimensions, including batch size, for all inputs" << std::endl << + std::endl; + // clang-format on + ReportingOptions::help(os); + os << std::endl; + SystemOptions::help(os); + os << std::endl; + helpHelp(os); +} + +} // namespace sample diff --git a/samples/common/sampleOptions.h b/samples/common/sampleOptions.h new file mode 100644 index 00000000..83e6c825 --- /dev/null +++ b/samples/common/sampleOptions.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TRT_SAMPLE_OPTIONS_H +#define TRT_SAMPLE_OPTIONS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +namespace sample +{ + +// Build default params +constexpr int defaultMaxBatch{1}; +constexpr int defaultWorkspace{16}; +constexpr int defaultMinTiming{1}; +constexpr int defaultAvgTiming{8}; + +// System default params +constexpr int defaultDevice{0}; + +// Inference default params +constexpr int defaultBatch{1}; +constexpr int defaultStreams{1}; +constexpr int defaultIterations{10}; +constexpr int defaultWarmUp{200}; +constexpr int defaultDuration{10}; +constexpr int defaultSleep{0}; + +// Reporting default params +constexpr int defaultAvgRuns{10}; +constexpr float defaultPercentile{99}; + +enum class ModelFormat +{ + kANY, + kCAFFE, + kONNX, + kUFF +}; + +using Arguments = std::unordered_multimap; + +using IOFormat = std::pair; + +using ShapeRange = std::array()>; + +struct Options +{ + virtual void parse(Arguments& arguments) = 0; +}; + +struct BaseModelOptions : public Options +{ + ModelFormat format{ModelFormat::kANY}; + std::string model; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct UffInput : public Options +{ + std::vector> inputs; + bool NHWC{false}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct ModelOptions : public Options +{ + BaseModelOptions baseModel; + std::string prototxt; + std::vector outputs; + UffInput uffInputs; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct BuildOptions : public Options +{ + // bool explicitBatch{false}; + int maxBatch{defaultMaxBatch}; // Parsing sets maxBatch to 0 if explicitBatch is true + int workspace{defaultWorkspace}; + int minTiming{defaultMinTiming}; + int avgTiming{defaultAvgTiming}; + bool fp16{false}; + bool int8{false}; + bool safe{false}; + bool save{false}; + bool load{false}; + std::string engine; + std::string calibration; + std::unordered_map shapes; + std::vector inputFormats; + std::vector outputFormats; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct SystemOptions : public Options +{ + int device{defaultDevice}; + int DLACore{-1}; + bool fallback{false}; + std::vector plugins; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct InferenceOptions : public Options +{ + int batch{defaultBatch}; // Parsing sets batch to 0 is shapes is not empty + int iterations{defaultIterations}; + int warmup{defaultWarmUp}; + int duration{defaultDuration}; + int sleep{defaultSleep}; + int streams{defaultStreams}; + bool threads{true}; + bool graph{false}; + bool skip{false}; + std::unordered_map shapes; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct ReportingOptions : public Options +{ + bool verbose{false}; + int avgs{defaultAvgRuns}; + float percentile{defaultPercentile}; + bool output{false}; + bool profile{false}; + std::string exportTimes{}; + std::string exportProfile{}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +struct AllOptions : public Options +{ + ModelOptions model; + BuildOptions build; + SystemOptions system; + InferenceOptions inference; + ReportingOptions reporting; + bool helps{false}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +Arguments 
argsToArgumentsMap(int argc, char* argv[]); + +bool parseHelp(Arguments& arguments); + +void helpHelp(std::ostream& out); + +// Functions to print options + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); + +std::ostream& operator<<(std::ostream& os, const UffInput& input); + +std::ostream& operator<<(std::ostream& os, const IOFormat& format); + +std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims); + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options); + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options); + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options); + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options); + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options); + +std::ostream& operator<<(std::ostream& os, const AllOptions& options); + +// Utils to extract options + +inline std::vector splitToStringVec(const std::string& option, char separator) +{ + std::vector options; + + for (size_t start = 0; start < option.length();) + { + size_t separatorIndex = option.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = option.length(); + } + options.emplace_back(option.substr(start, separatorIndex - start)); + start = separatorIndex + 1; + } + + return options; +} + +template +inline T stringToValue(const std::string& option) +{ + return T{option}; +} + +template <> +inline int stringToValue(const std::string& option) +{ + return std::stoi(option); +} + +template <> +inline float stringToValue(const std::string& option) +{ + return std::stof(option); +} + +template <> +inline bool stringToValue(const std::string& option) +{ + return true; +} + +template <> +inline nvinfer1::Dims stringToValue(const std::string& option) +{ + nvinfer1::Dims dims; + dims.nbDims = 0; + std::vector dimsStrings = splitToStringVec(option, 'x'); + for (const auto& d : dimsStrings) + { + if (d == "*") + { + break; + } + dims.d[dims.nbDims] = stringToValue(d); + ++dims.nbDims; + } + return dims; +} + +template <> +inline nvinfer1::DataType stringToValue(const std::string& option) +{ + const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, + {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, + {"int32", nvinfer1::DataType::kINT32}}; + auto dt = strToDT.find(option); + if (dt == strToDT.end()) + { + throw std::invalid_argument("Invalid DataType " + option); + } + return dt->second; +} + +template <> +inline nvinfer1::TensorFormats stringToValue(const std::string& option) +{ + std::vector optionStrings = splitToStringVec(option, '+'); + const std::unordered_map strToFmt{{"chw", nvinfer1::TensorFormat::kLINEAR}, + {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, + {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, + {"chw32", nvinfer1::TensorFormat::kCHW32}}; + nvinfer1::TensorFormats formats{}; + for (auto f : optionStrings) + { + auto tf = strToFmt.find(f); + if (tf == strToFmt.end()) + { + throw std::invalid_argument(std::string("Invalid TensorFormat ") + f); + } + formats |= 1U << int(tf->second); + } + + return formats; +} + +template <> +inline IOFormat stringToValue(const std::string& option) +{ + IOFormat ioFormat{}; + size_t colon = option.find(':'); + + if (colon == std::string::npos) + { + throw std::invalid_argument(std::string("Invalid 
IOFormat ") + option); + } + ioFormat.first = stringToValue(option.substr(0, colon)); + ioFormat.second = stringToValue(option.substr(colon + 1)); + + return ioFormat; +} + +inline const char* boolToEnabled(bool enable) +{ + return enable ? "Enabled" : "Disabled"; +} + +template +inline bool checkEraseOption(Arguments& arguments, const std::string& option, T& value) +{ + auto match = arguments.find(option); + if (match != arguments.end()) + { + value = stringToValue(match->second); + arguments.erase(match); + return true; + } + + return false; +} + +template +inline bool checkEraseRepeatedOption(Arguments& arguments, const std::string& option, std::vector& values) +{ + auto match = arguments.equal_range(option); + if (match.first == match.second) + { + return false; + } + auto addValue = [&values](Arguments::value_type& value) { values.emplace_back(stringToValue(value.second)); }; + std::for_each(match.first, match.second, addValue); + arguments.erase(match.first, match.second); + return true; +} + +} // namespace sample + +#endif // TRT_SAMPLES_OPTIONS_H diff --git a/samples/common/sampleUtils.h b/samples/common/sampleUtils.h new file mode 100644 index 00000000..5678e08a --- /dev/null +++ b/samples/common/sampleUtils.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_UTILS_H +#define TRT_SAMPLE_UTILS_H + +#include +#include +#include +#include + +#include "NvInfer.h" + +namespace sample +{ + +inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) +{ + if (ret != cudaSuccess) + { + err << "Cuda failure: " << ret << std::endl; + abort(); + } +} + +template +struct destroyer +{ + void operator()(T* t) + { + t->destroy(); + } +}; + +template +using unique_ptr = std::unique_ptr>; + +inline int64_t volume(const nvinfer1::Dims& d) +{ + return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); +} + +} // namespace sample + +#endif // TRT_SAMPLE_UTILS_H diff --git a/samples/common/windows/getopt.c b/samples/common/windows/getopt.c index c0794138..7a4b10ab 100644 --- a/samples/common/windows/getopt.c +++ b/samples/common/windows/getopt.c @@ -66,7 +66,6 @@ * limitations under the License. 
*/ - #include #include #include diff --git a/samples/opensource/CMakeLists.txt b/samples/opensource/CMakeLists.txt index f8405c9e..8184515a 100644 --- a/samples/opensource/CMakeLists.txt +++ b/samples/opensource/CMakeLists.txt @@ -15,6 +15,7 @@ # set(OPENSOURCE_SAMPLES_LIST sampleCharRNN + sampleDynamicReshape sampleFasterRCNN sampleGoogleNet sampleINT8 @@ -23,10 +24,16 @@ set(OPENSOURCE_SAMPLES_LIST sampleMNIST sampleMNISTAPI sampleMovieLens + sampleMovieLensMPS + sampleNMT sampleOnnxMNIST samplePlugin + sampleReformatFreeIO sampleSSD + sampleUffFasterRCNN + sampleUffMaskRCNN sampleUffMNIST + sampleUffPluginV2Ext sampleUffSSD trtexec ) diff --git a/samples/opensource/sampleCharRNN/README.md b/samples/opensource/sampleCharRNN/README.md index abe6d9a2..873feca3 100644 --- a/samples/opensource/sampleCharRNN/README.md +++ b/samples/opensource/sampleCharRNN/README.md @@ -87,13 +87,12 @@ If you want to train your own model and not use the pre-trained model included i To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. The following example output is printed when running the sample: ``` -Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=] [--useDLACore=] +Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=] --help Display help information --datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use data/samples/char-rnn/ and data/char-rnn/ ---useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform. ``` diff --git a/samples/opensource/sampleCharRNN/sampleCharRNN.cpp b/samples/opensource/sampleCharRNN/sampleCharRNN.cpp index 3e0e7a64..295ca713 100644 --- a/samples/opensource/sampleCharRNN/sampleCharRNN.cpp +++ b/samples/opensource/sampleCharRNN/sampleCharRNN.cpp @@ -20,7 +20,7 @@ //! It uses weights from a trained TensorFlow model and creates the network //! using the TensorRT network definition API //! It can be run with the following command line: -//! Command: ./sample_char_rnn [-h or --help] [-d or --datadir=] [--useDLACore=] +//! Command: ./sample_char_rnn [-h or --help] [-d or --datadir=] //! #include @@ -181,8 +181,8 @@ class SampleCharRNN //! //! \brief Create full model using the TensorRT network definition API and build the engine. //! - void constructNetwork( - SampleUniquePtr& builder, SampleUniquePtr& network); + void constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& config); //! //! 
\brief Looks up the embedding tensor for a given char and copies it to input buffer @@ -228,15 +228,19 @@ bool SampleCharRNN::build() { return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } mWeightMap = SampleCharRNN::loadWeights(mParams.weightFileName); builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(32_MB); - builder->allowGPUFallback(true); - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + config->setMaxWorkspaceSize(32_MiB); + config->setFlag(BuilderFlag::kGPU_FALLBACK); - constructNetwork(builder, network); + constructNetwork(builder, network, config); if (!mEngine) { @@ -274,7 +278,9 @@ std::map SampleCharRNN::loadWeights(const std::s while (count--) { if (mParams.weightNames.names.empty()) + { break; + } nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; @@ -413,7 +419,9 @@ nvinfer1::IRNNv2Layer* SampleCharRNN::addRNNv2Layer(SampleUniquePtrgetOutput(0)->setName("RNN output"); rnn->setHiddenState(*hiddenIn); if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM) + { rnn->setCellState(*cellIn); + } // Specify sequence lengths. Note this can be omitted since we are always using the maximum // sequence length, but for illustrative purposes we explicitly pass in sequence length data @@ -469,8 +477,8 @@ nvinfer1::IRNNv2Layer* SampleCharRNN::addRNNv2Layer(SampleUniquePtr& builder, SampleUniquePtr& network) +void SampleCharRNN::constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& config) { // add RNNv2 layer and set its parameters auto rnn = SampleCharRNN::addRNNv2Layer(network); @@ -514,7 +522,8 @@ void SampleCharRNN::constructNetwork( gLogInfo << "Done constructing network..." << std::endl; - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); } //! @@ -696,8 +705,6 @@ SampleCharRNNParams initializeSampleParams(const samplesCommon::Args& args) params.vocabSize = 65; params.outputSize = 1; params.weightFileName = locateFile("char-rnn.wts", params.dataDirs); - params.dlaCore = args.useDLACore; - params.fp16 = args.runInFp16; // Input strings and their respective expected output strings const std::vector inS{ @@ -736,16 +743,12 @@ SampleCharRNNParams initializeSampleParams(const samplesCommon::Args& args) //! void printHelpInfo() { - std::cout - << "Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=] [--useDLACore=]\n"; + std::cout << "Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=]\n"; std::cout << "--help Display help information\n"; std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " "multiple times to add multiple directories. If no data directories are given, the default is to use " "data/samples/char-rnn/ and data/char-rnn/" << std::endl; - std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " - "where n is the number of DLA engines on the platform." - << std::endl; } //! @@ -753,6 +756,7 @@ void printHelpInfo() //! 
int main(int argc, char** argv) { + setReportableSeverity(Logger::Severity::kVERBOSE); samplesCommon::Args args; bool argsOK = samplesCommon::parseArgs(args, argc, argv); if (!argsOK) diff --git a/samples/opensource/sampleDynamicReshape/CMakeLists.txt b/samples/opensource/sampleDynamicReshape/CMakeLists.txt new file mode 100644 index 00000000..3651053b --- /dev/null +++ b/samples/opensource/sampleDynamicReshape/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +SET(SAMPLE_SOURCES + sampleDynamicReshape.cpp +) +set(SAMPLE_PARSERS "onnx") +include(../../CMakeSamplesTemplate.txt) diff --git a/samples/opensource/sampleDynamicReshape/README.md b/samples/opensource/sampleDynamicReshape/README.md new file mode 100644 index 00000000..66697eb3 --- /dev/null +++ b/samples/opensource/sampleDynamicReshape/README.md @@ -0,0 +1,233 @@ +# Digit Recognition With Dynamic Shapes In TensorRT + + +**Table Of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) + * [Creating the preprocessing network](#creating-the-preprocessing-network) + * [Parsing the ONNX MNIST model](#parsing-the-onnx-mnist-model) + * [Building engines](#building-engines) + * [Running inference](#running-inference) + * [TensorRT API layers and ops](#tensorrt-api-layers-and-ops) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, sampleDynamicReshape, demonstrates how to use dynamic input dimensions in TensorRT. It creates an engine that takes a dynamically shaped input and resizes it to be consumed by an ONNX MNIST model that expects a fixed size input. For more information, see [Working With Dynamic Shapes](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#work_dynamic_shapes) in the TensorRT Developer Guide. + +## How does this sample work? + +This sample creates an engine for resizing an input with dynamic dimensions to a size that an ONNX MNIST model can consume. 
+ +Specifically, this sample: +- Creates a network with dynamic input dimensions to act as a preprocessor for the model +- Parses an ONNX MNIST model to create a second network +- Builds engines for both networks +- Runs inference using both engines + +### Creating the preprocessing network + +First, create a network with full dims support: +`auto preprocessorNetwork = this->makeUnique(builder->createNetworkV2(1U << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));` + +Next, add an input layer that accepts an input with a dynamic shape, followed by a resize layer that will reshape the input to the shape the model expects: +``` +auto input = preprocessorNetwork->addInput("input", nvinfer1::DataType::kFLOAT, Dims3{1, -1, -1}); +auto resizeLayer = preprocessorNetwork->addResize(*input); +resizeLayer->setOutputDimensions(mPredictionInputDims); +preprocessorNetwork->markOutput(*resizeLayer->getOutput(0)); +``` + +The -1 dimensions denote dimensions that will be supplied at runtime. + +### Parsing the ONNX MNIST model + +First, create an empty network and a parser: +``` +auto network = this->makeUnique(builder->createNetwork()); +auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger()); +``` + +Next, parse the model file to populate the network: +``` +parser->parseFromFile(locateFile(mParams.onnxFileName, mParams.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity())); +``` + +### Building engines + +When building the preprocessor engine, also provide an optimization profile so that TensorRT knows which input shapes to optimize for: +``` +auto preprocessorConfig = this->makeUnique(builder->createBuilderConfig()); +auto profile = builder->createOptimizationProfile(); +``` + +`OptProfileSelector::kOPT` specifies the dimensions that the profile will be optimized for, whereas `OptProfileSelector::kMIN` and `OptProfileSelector::kMAX` specify the minimum and maximum dimensions for which the profile will be valid: +``` +profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims3{1, 1, 1}); +profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims3{1, 28, 28}); +profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims3{1, 56, 56}); +preprocessorConfig->addOptimizationProfile(profile); +mPreprocessorEngine = this->makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig)); +``` + +For the MNIST model, attach a Softmax layer to the end of the network and replace the existing network output with the Softmax: +``` +auto softmax = network->addSoftMax(*network->getOutput(0)); +network->unmarkOutput(*network->getOutput(0)); +network->markOutput(*softmax->getOutput(0)); +``` + +Finally, build as normal: +`mPredictionEngine = this->makeUnique(builder->buildEngineWithConfig(*network, *config));` + +### Running inference + +During inference, first copy the input buffer to the device: +``` +CHECK(cudaMemcpy(mInput.deviceBuffer.data(), mInput.hostBuffer.data(), mInput.hostBuffer.nbBytes(), cudaMemcpyHostToDevice)); +``` + +Since the preprocessor engine accepts dynamic shapes, specify the actual shape of the current input to the execution context: +`mPreprocessorContext->setBindingDimensions(0, inputDims);` + +Next, run the preprocessor using the `executeV2` function. 
The example writes the output of the preprocessor engine directly to the input device buffer of the MNIST engine: +``` +std::vector preprocessorBindings = {mInput.deviceBuffer.data(), mPredictionInput.data()}; +bool status = mPreprocessorContext->executeV2(preprocessorBindings.data()); +``` + +Then, run the MNIST engine: +``` +std::vector predicitonBindings = {mPredictionInput.data(), mOutput.deviceBuffer.data()}; +status = mPredictionContext->execute(mParams.batchSize, predicitonBindings.data()); +``` + +Finally, copy the output back to the host: +``` +CHECK(cudaMemcpy(mOutput.hostBuffer.data(), mOutput.deviceBuffer.data(), mOutput.deviceBuffer.nbBytes(), cudaMemcpyDeviceToHost)); +``` + +### TensorRT API layers and ops + +In this sample, the following layers are used. For more information about these layers, see the [TensorRT Developer Guide: Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#layers) documentation. + +[Resize layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#resize-layer) +The IResizeLayer implements the resize operation on an input tensor. + +## Running the sample + +1. Compile this sample by running `make` in the `/samples/sampleDynamicReshape` directory. The binary named `sample_dynamic_reshape` will be created in the `/bin` directory. + ``` + cd /samples/sampleDynamicReshape + make + ``` + + Where `` is where you installed TensorRT. + +2. Run the sample. + ``` + ./sample_dynamic_reshape [-h or --help] [-d or --datadir=] [--useDLACore=] [--int8 or --fp16] + ``` + +3. Verify that the sample ran successfully. If the sample runs successfully you should see output similar to the following: + ``` + &&&& RUNNING TensorRT.sample_dynamic_reshape # ./sample_dynamic_reshape + ---------------------------------------------------------------- + Input filename: ../../../../../../data/samples/mnist/mnist.onnx + ONNX IR version: 0.0.3 + Opset version: 1 + Producer name: CNTK + Producer version: 2.4 + Domain: + Model version: 1 + Doc string: + ---------------------------------------------------------------- + [I] Input: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@*. .*@@@@@@@@@@@ + @@@@@@@@@@*. +@@@@@@@@@@ + @@@@@@@@@@. :#+ %@@@@@@@@@ + @@@@@@@@@@.:@@@+ +@@@@@@@@@ + @@@@@@@@@@.:@@@@: +@@@@@@@@ + @@@@@@@@@@=%@@@@: +@@@@@@@@ + @@@@@@@@@@@@@@@@# +@@@@@@@@ + @@@@@@@@@@@@@@@@* +@@@@@@@@ + @@@@@@@@@@@@@@@@: +@@@@@@@@ + @@@@@@@@@@@@@@@@: +@@@@@@@@ + @@@@@@@@@@@@@@@* .@@@@@@@@@ + @@@@@@@@@@%**%@. *@@@@@@@@@ + @@@@@@@@%+. .: .@@@@@@@@@@ + @@@@@@@@= .. :@@@@@@@@@@ + @@@@@@@@: *@@: :@@@@@@@@@@ + @@@@@@@% %@* *@@@@@@@@@ + @@@@@@@% ++ ++ .%@@@@@@@@ + @@@@@@@@- +@@- +@@@@@@@@ + @@@@@@@@= :*@@@# .%@@@@@@@ + @@@@@@@@@+*@@@@@%. %@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + [I] Output: + Prob 0 0.0000 Class 0: + Prob 1 0.0000 Class 1: + Prob 2 1.0000 Class 2: ********** + Prob 3 0.0000 Class 3: + Prob 4 0.0000 Class 4: + Prob 5 0.0000 Class 5: + Prob 6 0.0000 Class 6: + Prob 7 0.0000 Class 7: + Prob 8 0.0000 Class 8: + Prob 9 0.0000 Class 9: + + &&&& PASSED TensorRT.sample_dynamic_reshape # ./sample_dynamic_reshape + ``` + + This output shows that the sample ran successfully; `PASSED`. + + +### Sample `--help` options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. 
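To recap the inference flow described in the README sections above, the sketch below condenses the dynamic-shape steps into a single function. It is not part of the sample source: the function name and buffer parameters are illustrative placeholders, and it assumes the execution context was created from an engine built with an optimization profile that covers the runtime input shape.

```
#include <vector>
#include "NvInfer.h"

// Minimal sketch: run one dynamic-shape inference given an already-created
// execution context and pre-allocated device buffers (names are placeholders).
bool runDynamicInference(nvinfer1::IExecutionContext& context,
    const nvinfer1::Dims& actualInputDims, void* inputDeviceBuffer, void* outputDeviceBuffer)
{
    // Tell TensorRT the concrete shape of binding 0 for this run; it must fall
    // inside the [kMIN, kMAX] range of the engine's optimization profile.
    if (!context.setBindingDimensions(0, actualInputDims))
    {
        return false;
    }
    // Inference can only run once every dynamic input shape has been specified.
    if (!context.allInputDimensionsSpecified())
    {
        return false;
    }
    // executeV2 takes only the bindings; there is no separate batch-size argument.
    std::vector<void*> bindings{inputDeviceBuffer, outputDeviceBuffer};
    return context.executeV2(bindings.data());
}
```

The key point is that the engine itself stays shape-agnostic: any input whose dimensions fall inside the profile can be served, and the per-run shape is supplied through the execution context rather than baked into the engine.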
+ + +# Additional resources + +The following resources provide a deeper understanding of dynamic shapes. + +**ONNX** +- [GitHub: ONNX](https://github.com/onnx/onnx) +- [GitHub: ONNX-TensorRT open source parser](https://github.com/onnx/onnx-tensorrt) + +**Models** +- [MNIST - Handwritten Digit Recognition](https://github.com/onnx/models/tree/master/mnist) +- [GitHub: ONNX Models](https://github.com/onnx/models) + +**Documentation** +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Working With TensorRT Using The Python API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#python_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +# License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + + +# Changelog + +June 2019 +This is the first release of the `README.md` file and sample. + + +# Known issues + +There are no known issues in this sample. diff --git a/samples/opensource/sampleDynamicReshape/sampleDynamicReshape.cpp b/samples/opensource/sampleDynamicReshape/sampleDynamicReshape.cpp new file mode 100644 index 00000000..ade17b9d --- /dev/null +++ b/samples/opensource/sampleDynamicReshape/sampleDynamicReshape.cpp @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! +//! sampleDynamicReshape.cpp +//! This file contains the implementation of the dynamic reshape MNIST sample. It creates a network +//! using the MNIST ONNX model, and uses a second engine to resize inputs to the shape the model +//! expects. +//! It can be run with the following command: +//! Command: ./sample_dynamic_reshape [-h or --help [-d=/path/to/data/dir or --datadir=/path/to/data/dir] +//! + +#include "argsParser.h" +#include "buffers.h" +#include "common.h" +#include "logger.h" +#include "parserOnnxConfig.h" + +#include "NvInfer.h" +#include +#include + +const std::string gSampleName = "TensorRT.sample_dynamic_reshape"; + +//! \brief The SampleDynamicReshape class implementes the dynamic reshape sample. +//! +//! \details This class builds one engine that resizes a given input to the correct size, and a +//! second engine based on an ONNX MNIST model that generates a prediction. +//! +class SampleDynamicReshape +{ + template + using SampleUniquePtr = std::unique_ptr; + +public: + SampleDynamicReshape(const samplesCommon::OnnxSampleParams& params) + : mParams(params) + { + } + + //! + //! \brief Builds both engines. + //! + void build(); + + //! + //! \brief Prepares the model for inference by creating execution contexts and allocating buffers. + //! + void prepare(); + + //! + //! \brief Runs inference using TensorRT on a random image. + //! 
+ bool infer(); + +private: + void buildPreprocessorEngine(const SampleUniquePtr& builder); + void buildPredictionEngine(const SampleUniquePtr& builder); + + Dims loadPGMFile(const std::string& fileName); + bool validateOutput(int digit); + + samplesCommon::OnnxSampleParams mParams; //!< The parameters for the sample. + + nvinfer1::Dims mPredictionInputDims; //!< The dimensions of the input of the MNIST model. + nvinfer1::Dims mPredicitionOutputDims; //!< The dimensions of the output of the MNIST model. + + // Engines used for inference. The first is used for resizing inputs, the second for prediction. + SampleUniquePtr mPreprocessorEngine{nullptr}, mPredictionEngine{nullptr}; + + SampleUniquePtr mPreprocessorContext{nullptr}, mPredictionContext{nullptr}; + + samplesCommon::ManagedBuffer mInput{}; //!< Host and device buffers for the input. + samplesCommon::DeviceBuffer mPredictionInput{}; //!< Device buffer for the output of the preprocessor, i.e. the + //!< input to the prediction model. + samplesCommon::ManagedBuffer mOutput{}; //!< Host buffer for the ouptut + + template + SampleUniquePtr makeUnique(T* t) + { + if (!t) + { + throw std::runtime_error{"Failed to create TensorRT object"}; + } + return SampleUniquePtr{t}; + } +}; + +//! +//! \brief Builds the two engines required for inference. +//! +//! \details This function creates one TensorRT engine for resizing inputs to the correct sizes, +//! then creates a TensorRT network by parsing the ONNX model and builds +//! an engine that will be used to run inference (mPredictionEngine). +//! +void SampleDynamicReshape::build() +{ + auto builder = this->makeUnique(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); + + // This function will also set mPredictionInputDims and mPredicitionOutputDims, + // so it needs to be called before building the preprocessor. + this->buildPredictionEngine(builder); + this->buildPreprocessorEngine(builder); +} + +//! +//! \brief Builds an engine for preprocessing (mPreprocessorEngine). +//! +void SampleDynamicReshape::buildPreprocessorEngine(const SampleUniquePtr& builder) +{ + // Create the preprocessor engine using a network that supports full dimensions (createNetworkV2). + auto preprocessorNetwork = this->makeUnique( + builder->createNetworkV2(1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); + + // Reshape a dynamically shaped input to the size expected by the model, (1, 28, 28). + auto input = preprocessorNetwork->addInput("input", nvinfer1::DataType::kFLOAT, Dims3{1, -1, -1}); + auto resizeLayer = preprocessorNetwork->addResize(*input); + resizeLayer->setOutputDimensions(mPredictionInputDims); + preprocessorNetwork->markOutput(*resizeLayer->getOutput(0)); + + // Finally, configure and build the preprocessor engine. + auto preprocessorConfig = this->makeUnique(builder->createBuilderConfig()); + + // Create an optimization profile so that we can specify a range of input dimensions. 
+ auto profile = builder->createOptimizationProfile(); + + // This profile will be valid for all images whose size falls in the range of [(1, 1, 1), (1, 56, 56)] + // but the TensorRT will optimize for (1, 28, 28) + profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims3{1, 1, 1}); + profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims3{1, 28, 28}); + profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims3{1, 56, 56}); + preprocessorConfig->addOptimizationProfile(profile); + mPreprocessorEngine = this->makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig)); + gLogInfo << "Profile dimensions in preprocessor engine:\n"; + gLogInfo << " Minimum = " << mPreprocessorEngine->getProfileDimensions(0, 0, OptProfileSelector::kMIN) << '\n'; + gLogInfo << " Optimum = " << mPreprocessorEngine->getProfileDimensions(0, 0, OptProfileSelector::kOPT) << '\n'; + gLogInfo << " Maximum = " << mPreprocessorEngine->getProfileDimensions(0, 0, OptProfileSelector::kMAX) + << std::endl; +} + +//! +//! \brief Builds an engine for prediction (mPredictionEngine). +//! +//! \details This function builds an engine for the MNIST model, and updates mPredictionInputDims and +//! mPredicitionOutputDims according to the dimensions specified by the model. The preprocessor reshapes inputs to +//! mPredictionInputDims. +//! +void SampleDynamicReshape::buildPredictionEngine(const SampleUniquePtr& builder) +{ + // Create a network using the parser. + auto network = this->makeUnique(builder->createNetwork()); + auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger()); + bool parsingSuccess = parser->parseFromFile( + locateFile(mParams.onnxFileName, mParams.dataDirs).c_str(), static_cast(gLogger.getReportableSeverity())); + if (!parsingSuccess) + { + throw std::runtime_error{"Failed to parse model"}; + } + + // Attach a softmax layer to the end of the network. + auto softmax = network->addSoftMax(*network->getOutput(0)); + network->unmarkOutput(*network->getOutput(0)); + network->markOutput(*softmax->getOutput(0)); + + // Get information about the inputs/outputs directly from the model. + mPredictionInputDims = network->getInput(0)->getDimensions(); + mPredicitionOutputDims = network->getOutput(0)->getDimensions(); + + // Create a builder config + auto config = this->makeUnique(builder->createBuilderConfig()); + config->setMaxWorkspaceSize(16_MiB); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + if (mParams.int8) + { + config->setFlag(BuilderFlag::kINT8); + samplesCommon::setAllTensorScales(network.get(), 127.0f, 127.0f); + } + // Build the prediciton engine. + mPredictionEngine = this->makeUnique(builder->buildEngineWithConfig(*network, *config)); +} + +//! +//! \brief Prepares the model for inference by creating an execution context and allocating buffers. +//! +//! \details This function sets up the sample for inference. This involves allocating buffers for the inputs and +//! outputs, as well as creating TensorRT execution contexts for both engines. This only needs to be called a single +//! time. +//! +void SampleDynamicReshape::prepare() +{ + mPreprocessorContext = this->makeUnique(mPreprocessorEngine->createExecutionContext()); + mPredictionContext = this->makeUnique(mPredictionEngine->createExecutionContext()); + // Since input dimensions are not known ahead of time, we only allocate the output buffer and preprocessor output + // buffer. 
+ mPredictionInput.resize(mPredictionInputDims); + mOutput.hostBuffer.resize(mPredicitionOutputDims); + mOutput.deviceBuffer.resize(mPredicitionOutputDims); +} + +//! +//! \brief Runs inference for this sample +//! +//! \details This function is the main execution function of the sample. +//! It runs inference for using a random image from the MNIST dataset as an input. +//! +bool SampleDynamicReshape::infer() +{ + // Load a random PGM file into a host buffer, then copy to device. + std::random_device rd{}; + std::default_random_engine generator{rd()}; + std::uniform_int_distribution digitDistribution{0, 9}; + int digit = digitDistribution(generator); + + Dims inputDims = this->loadPGMFile(locateFile(std::to_string(digit) + ".pgm", mParams.dataDirs)); + mInput.deviceBuffer.resize(inputDims); + CHECK(cudaMemcpy( + mInput.deviceBuffer.data(), mInput.hostBuffer.data(), mInput.hostBuffer.nbBytes(), cudaMemcpyHostToDevice)); + + // Set the input size for the preprocessor + mPreprocessorContext->setBindingDimensions(0, inputDims); + // We can only run inference once all dynamic input shapes have been specified. + if (!mPreprocessorContext->allInputDimensionsSpecified()) + { + return false; + } + + // Run the preprocessor to resize the input to the correct shape + std::vector preprocessorBindings = {mInput.deviceBuffer.data(), mPredictionInput.data()}; + // For engines using full dims, we can use executeV2, which does not include a separate batch size parameter. + bool status = mPreprocessorContext->executeV2(preprocessorBindings.data()); + if (!status) + { + return false; + } + + // Next, run the model to generate a prediction. + std::vector predicitonBindings = {mPredictionInput.data(), mOutput.deviceBuffer.data()}; + status = mPredictionContext->execute(mParams.batchSize, predicitonBindings.data()); + if (!status) + { + return false; + } + + // Copy the outputs back to the host and verify the output. + CHECK(cudaMemcpy(mOutput.hostBuffer.data(), mOutput.deviceBuffer.data(), mOutput.deviceBuffer.nbBytes(), + cudaMemcpyDeviceToHost)); + return validateOutput(digit); +} + +//! +//! \brief Loads a PGM file into mInput and returns the dimensions of the loaded image. +//! +//! \details This function loads the specified PGM file into the input host buffer. +//! +Dims SampleDynamicReshape::loadPGMFile(const std::string& fileName) +{ + std::ifstream infile(fileName, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + + std::string magic; + int h, w, max; + infile >> magic >> h >> w >> max; + + infile.seekg(1, infile.cur); + Dims3 inputDims{1, h, w}; + size_t vol = samplesCommon::volume(inputDims); + std::vector fileData(vol); + infile.read(reinterpret_cast(fileData.data()), vol); + + // Print an ascii representation + gLogInfo << "Input:\n"; + for (size_t i = 0; i < vol; i++) + { + gLogInfo << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % w) ? "" : "\n"); + } + gLogInfo << std::endl; + + // Normalize and copy to the host buffer. + mInput.hostBuffer.resize(inputDims); + float* hostDataBuffer = static_cast(mInput.hostBuffer.data()); + std::transform(fileData.begin(), fileData.end(), hostDataBuffer, + [](uint8_t x) { return 1.0 - static_cast(x / 255.0); }); + return inputDims; +} + +//! +//! \brief Checks whether the model prediction (in mOutput) is correct. +//! 
+bool SampleDynamicReshape::validateOutput(int digit) +{ + const float* bufRaw = static_cast(mOutput.hostBuffer.data()); + std::vector prob(bufRaw, bufRaw + mOutput.hostBuffer.size()); + + int curIndex{0}; + for (const auto& elem : prob) + { + gLogInfo << " Prob " << curIndex << " " << std::fixed << std::setw(5) << std::setprecision(4) << elem << " " + << "Class " << curIndex << ": " << std::string(int(std::floor(elem * 10 + 0.5f)), '*') << std::endl; + ++curIndex; + } + + int predictedDigit = std::max_element(prob.begin(), prob.end()) - prob.begin(); + return digit == predictedDigit; +} + +//! +//! \brief Initializes members of the params struct using the command line args +//! +samplesCommon::OnnxSampleParams initializeSampleParams(const samplesCommon::Args& args) +{ + samplesCommon::OnnxSampleParams params; + if (args.dataDirs.empty()) //!< Use default directories if user hasn't provided directory paths + { + params.dataDirs.push_back("data/mnist/"); + params.dataDirs.push_back("data/samples/mnist/"); + } + else //!< Use the data directory provided by the user + { + params.dataDirs = args.dataDirs; + } + params.onnxFileName = "mnist.onnx"; + params.inputTensorNames.push_back("Input3"); + params.outputTensorNames.push_back("Plus214_Output_0"); + params.batchSize = 1; + params.int8 = args.runInInt8; + params.fp16 = args.runInFp16; + return params; +} + +//! +//! \brief Prints the help information for running this sample +//! +void printHelpInfo() +{ + std::cout << "Usage: ./sample_dynamic_reshape [-h or --help] [-d or --datadir=]" + << std::endl; + std::cout << "--help Display help information" << std::endl; + std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " + "multiple times to add multiple directories. If no data directories are given, the default is to use " + "(data/samples/mnist/, data/mnist/)" + << std::endl; + std::cout << "--int8 Run in Int8 mode." << std::endl; + std::cout << "--fp16 Run in FP16 mode." << std::endl; +} + +int main(int argc, char** argv) +{ + samplesCommon::Args args; + bool argsOK = samplesCommon::parseArgs(args, argc, argv); + if (!argsOK) + { + gLogError << "Invalid arguments" << std::endl; + printHelpInfo(); + return EXIT_FAILURE; + } + if (args.help) + { + printHelpInfo(); + return EXIT_SUCCESS; + } + + auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); + + gLogger.reportTestStart(sampleTest); + + SampleDynamicReshape sample{initializeSampleParams(args)}; + + sample.build(); + sample.prepare(); + + if (!sample.infer()) + { + return gLogger.reportFail(sampleTest); + } + + return gLogger.reportPass(sampleTest); +} diff --git a/samples/opensource/sampleFasterRCNN/sampleFasterRCNN.cpp b/samples/opensource/sampleFasterRCNN/sampleFasterRCNN.cpp index 08a377a8..83bd5320 100644 --- a/samples/opensource/sampleFasterRCNN/sampleFasterRCNN.cpp +++ b/samples/opensource/sampleFasterRCNN/sampleFasterRCNN.cpp @@ -95,7 +95,8 @@ class SampleFasterRCNN //! \brief Parses a Caffe model for FasterRCNN and creates a TensorRT network //! void constructNetwork(SampleUniquePtr& parser, - SampleUniquePtr& builder, SampleUniquePtr& network); + SampleUniquePtr& builder, SampleUniquePtr& network, + SampleUniquePtr& config); //! //! 
\brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer @@ -142,14 +143,21 @@ bool SampleFasterRCNN::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); if (!parser) { return false; } - constructNetwork(parser, builder, network); + constructNetwork(parser, builder, network, config); - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; @@ -171,7 +179,8 @@ bool SampleFasterRCNN::build() //! \param builder Pointer to the engine builder //! void SampleFasterRCNN::constructNetwork(SampleUniquePtr& parser, - SampleUniquePtr& builder, SampleUniquePtr& network) + SampleUniquePtr& builder, SampleUniquePtr& network, + SampleUniquePtr& config) { const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(mParams.prototxtFileName, mParams.dataDirs).c_str(), @@ -183,8 +192,8 @@ void SampleFasterRCNN::constructNetwork(SampleUniquePtrsetMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + config->setMaxWorkspaceSize(16_MiB); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); } //! diff --git a/samples/opensource/sampleGoogleNet/sampleGoogleNet.cpp b/samples/opensource/sampleGoogleNet/sampleGoogleNet.cpp index ea979abc..0033c874 100644 --- a/samples/opensource/sampleGoogleNet/sampleGoogleNet.cpp +++ b/samples/opensource/sampleGoogleNet/sampleGoogleNet.cpp @@ -103,6 +103,12 @@ bool SampleGoogleNet::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); if (!parser) { @@ -111,10 +117,11 @@ bool SampleGoogleNet::build() constructNetwork(parser, network); builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + config->setMaxWorkspaceSize(16_MiB); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) return false; diff --git a/samples/opensource/sampleINT8/README.md b/samples/opensource/sampleINT8/README.md index 0e0a8aac..b10b2897 100644 --- a/samples/opensource/sampleINT8/README.md +++ b/samples/opensource/sampleINT8/README.md @@ -18,8 +18,6 @@ * [Generating batch files for non-Caffe users](#generating-batch-files-for-non-caffe-users) - [Running the sample](#running-the-sample) * [Sample `--help` options](#sample---help-options) -- [Additional resources](#additional-resources) - * [Sample `--help` options](#sample-help-options) - [Additional resources](#additiona-resources) - [License](#license) - [Changelog](#changelog) @@ -36,7 +34,7 @@ Specifically, this sample demonstrates how to perform inference in 8-bit integer INT8 engines are build from 32-bit network definitions, similarly to 32-bit and 16-bit engines, but with more configuration steps. 
In particular, the builder and network must be configured to use INT8, which requires per-tensor dynamic ranges. The INT8 calibrator can determine how best to represent weights and activations as 8-bit integers and sets the per tensor dynamic ranges accordingly. Alternatively, you can set custom per tensor dynamic ranges; this is covered in sampleINT8API. -This sample is accompanied by the [MNIST training set](https://github.com/BVLC/caffe/blob/master/data/mnist/get_mnist.sh) located in the TensorRT-5.1.0.4/data/mnist/batches directory. The packaged MNIST model that is shipped with this sample is based on [lenet.prototxt](https://github.com/BVLC/caffe/edit/master/examples/mnist/lenet.prototxt). For more information, see the [MNIST BVLC Caffe example](https://github.com/BVLC/caffe/tree/master/examples/mnist). This sample can also be used with other Image classification models, for example, [deploy.prototxt](https://github.com/BVLC/caffe/blob/master/models/bvlc_googlenet/deploy.prototxt). +This sample is accompanied by the [MNIST training set](https://github.com/BVLC/caffe/blob/master/data/mnist/get_mnist.sh) located in the `TensorRT-x.x.x.x/data/mnist` directory, where `x.x.x.x` is your installed version of TensorRT. The packaged MNIST model that is shipped with this sample is based on [lenet.prototxt](https://github.com/BVLC/caffe/edit/master/examples/mnist/lenet.prototxt). For more information, see the [MNIST BVLC Caffe example](https://github.com/BVLC/caffe/tree/master/examples/mnist). This sample can also be used with other Image classification models, for example, [deploy.prototxt](https://github.com/BVLC/caffe/blob/master/models/bvlc_googlenet/deploy.prototxt). The packaged data set file that is shipped with this sample is based on the [MNIST data set](https://github.com/BVLC/caffe/tree/master/data/mnist). However, the batch file generation from the above data set is described in [Batch files for calibration](#batch-files-for-calibration). @@ -215,102 +213,10 @@ The SoftMax layer applies the SoftMax function on the input tensor along an inpu ## Batch files for calibration -You can use the calibrated data that comes with this sample or you can generate the calibration data yourself. This sample uses batch files in order to calibrate for the INT8 data. The INT8 batch file is a binary file containing a set of N images, whose format is as follows: - -- Four 32-bit integer values representing `{N, C, H, W}` representing the number of images N in the file, and the dimensions `{C, H, W}` of each image. -- N 32-bit floating point data blobs of dimensions `{C, H, W}` that are used as inputs to the network. - -If you want to generate calibration data yourself, refer to the following sections. - -### Generating batch files for Caffe users - -Calibration requires that the images passed to the calibrator are in the same format as those that will be passed to TensorRT at runtime. For developers using Caffe for training, or who can easily transfer their network to Caffe, a supplied patchset supports capturing images after image preprocessing. - -The instructions are provided so that users can easily use the sample code to test accuracy and performance on classification networks. In typical production use cases, applications will have such preprocessing already implemented, and should integrate with the calibrator directly. - -These instructions are for [Caffe git commit 473f143f9422e7fc66e9590da6b2a1bb88e50b2f](https://github.com/BVLC/caffe.git). 
The patch file might be slightly different for later versions of Caffe. - -1. Apply the patch. The patch can be applied by going to the root directory of the Caffe source tree and applying the patch with the command: - `patch -p1 < int8_caffe.patch` - -2. Rebuild Caffe and set the environment variable `TENSORRT_INT8_BATCH_DIRECTORY` to the location where the batch files are to be generated. - -After training for 1000 iterations, there are 1003 batch files in the directory specified. This occurs because Caffe preprocesses three batches in advance of the current iteration. - -These batch files can then be used with the `BatchStream` and `Int8Calibrator` to calibrate the data for INT8. - -**Note:** When running Caffe to generate the batch files, the training prototxt, and not the deployment prototxt, is required to be used. - -The following example depicts the sequence of commands to run `./sample_int8 mnist` with Caffe generated batch files. - -1. Navigate to the samples data directory and create an INT8 `mnist` directory: - ``` - cd /samples/data - mkdir -p int8/mnist - cd int8/mnist - ``` - - **Note:** If Caffe is not installed anywhere, ensure you clone, checkout, patch, and build Caffe at the specific commit: - - ``` - git clone https://github.com/BVLC/caffe.git - cd caffe - git checkout 473f143f9422e7fc66e9590da6b2a1bb88e50b2f - patch -p1 < /samples/mnist/int8_caffe.patch - mkdir build - pushd build - cmake -DUSE_OPENCV=FALSE -DUSE_CUDNN=OFF ../ - make -j4 - popd - ``` - -2. Download the `mnist` dataset from Caffe and create a link to it: - ``` - bash data/mnist/get_mnist.sh - bash examples/mnist/create_mnist.sh - cd .. - ln -s caffe/examples . - ``` - -3. Set the directory to store the batch data, execute Caffe, and link the `mnist` files: - ``` - mkdir batches - export TENSORRT_INT8_BATCH_DIRECTORY=batches - caffe/build/tools/caffe test -gpu 0 -iterations 1000 -model examples/mnist/lenet_train_test.prototxt -weights /samples/mnist/mnist.caffemodel - ln -s /samples/mnist/mnist.caffemodel . - ln -s /samples/mnist/mnist.prototxt . - ``` - -4. Execute sampleINT8 from the bin directory after being built with the following command: - `./sample_int8 mnist` - -### Generating batch files for non-Caffe users - -For developers that are not using Caffe, or cannot easily convert to Caffe, the batch files can be generated via the following sequence of steps on the input training data. - -1. Subtract out the normalized mean from the dataset. - -2. Crop all of the input data to the same dimensions. - -3. Split the data into batch files where each batch file has `N` preprocessed images and labels. - -4. Generate the batch files based on the format specified in [Batch files for calibration](#batch-files-for-calibration). - -The following example depicts the sequence of commands to run `./sample_int8 mnist` without Caffe. - -1. Navigate to the samples data directory and create an INT8 `mnist` directory: - ``` - cd /samples/data - mkdir -p int8/mnist/batches - cd int8/mnist - ln -s /samples/mnist/mnist.caffemodel . - ln -s /samples/mnist/mnist.prototxt . - ``` - -2. Copy the generated batch files to the `int8/mnist/batches/` directory. - -3. 
Execute sampleINT8 from the `bin` directory after being built with the following command: - `./sample_int8 mnist` +Download the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) + - This sample requires the [training set](http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz) and [training labels](http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz) + - Unzip the files obtained above using the `gunzip` utility. For example, `gunzip t10k-labels-idx1-ubyte.gz`. + - Lastly, copy these files to the `/samples/data/int8/mnist/` directory ## Running the sample @@ -357,20 +263,9 @@ The following example depicts the sequence of commands to run `./sample_int8 mni This output shows that the sample ran successfully; `PASSED`. -### Sample --help options - -To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. For example: -``` -Usage: ./sample_int8 [-h or --help] [--datadir=] [--useDLACore=] +### Sample `--help` options ---help Display help information ---datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use data/samples/ssd/ and data/ssd/ ---useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform. ---batch=N Set batch size (default = 100). ---start=N Set the first batch to be scored (default = 100). All batches before this batch will be used for calibration. ---score=N Set the number of batches to be scored (default = 400). - -``` +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. # Additional resources diff --git a/samples/opensource/sampleINT8/int8_caffe.patch b/samples/opensource/sampleINT8/int8_caffe.patch new file mode 100644 index 00000000..db949941 --- /dev/null +++ b/samples/opensource/sampleINT8/int8_caffe.patch @@ -0,0 +1,31 @@ +diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp +index 66e6301..da615e5 100644 +--- a/src/caffe/layers/data_layer.cpp ++++ b/src/caffe/layers/data_layer.cpp +@@ -99,6 +99,26 @@ void DataLayer::load_batch(Batch* batch) { + } + timer.Stop(); + batch_timer.Stop(); ++#define LOG_BATCHES_FOR_INT8_TESTING 1 ++#if LOG_BATCHES_FOR_INT8_TESTING ++ static int sBatchId = 0; ++ char* batch_dump_dir = getenv("TENSORRT_INT8_BATCH_DIRECTORY"); ++ if(batch_dump_dir != 0) ++ { ++ char buffer[1000]; ++ sprintf(buffer, "batches/batch%d", sBatchId++); ++ FILE* file = fopen(buffer, "w"); ++ if(file==0) ++ abort(); ++ ++ int s[4] = { top_shape[0], top_shape[1], top_shape[2], top_shape[3] }; ++ fwrite(s, sizeof(int), 4, file); ++ fwrite(top_data, sizeof(float), top_shape[0]*top_shape[1]*top_shape[2]*top_shape[3], file); ++ fwrite(&top_label[0], sizeof(int), top_shape[0], file); ++ fclose(file); ++ } ++#endif ++ + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; | fromdos diff --git a/samples/opensource/sampleINT8/sampleINT8.cpp b/samples/opensource/sampleINT8/sampleINT8.cpp index af41c412..ff1e238e 100644 --- a/samples/opensource/sampleINT8/sampleINT8.cpp +++ b/samples/opensource/sampleINT8/sampleINT8.cpp @@ -101,8 +101,8 @@ class SampleINT8 //! \brief Parses a Caffe model and creates a TensorRT network //! 
bool constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser, - DataType dataType); + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser, DataType dataType); //! //! \brief Reads the input and stores it in a managed buffer @@ -139,6 +139,12 @@ bool SampleINT8::build(DataType dataType) return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); if (!parser) { @@ -151,7 +157,7 @@ bool SampleINT8::build(DataType dataType) return false; } - auto constructed = constructNetwork(builder, network, parser, dataType); + auto constructed = constructNetwork(builder, network, config, parser, dataType); if (!constructed) { return false; @@ -195,9 +201,10 @@ bool SampleINT8::isSupported(DataType dataType) //! \param builder Pointer to the engine builder //! bool SampleINT8::constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser, - DataType dataType) + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser, DataType dataType) { + mEngine = nullptr; const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(mParams.prototxtFileName, mParams.dataDirs).c_str(), locateFile(mParams.weightsFileName, mParams.dataDirs).c_str(), *network, @@ -211,26 +218,32 @@ bool SampleINT8::constructNetwork(SampleUniquePtr& builder, // Calibrator life time needs to last until after the engine is built. std::unique_ptr calibrator; - builder->setAverageFindIterations(1); - builder->setMinFindIterations(1); - builder->setMaxWorkspaceSize(1_GB); - builder->setDebugSync(true); - builder->setFp16Mode(dataType == DataType::kHALF); - builder->setInt8Mode(dataType == DataType::kINT8); + config->setAvgTimingIterations(1); + config->setMinTimingIterations(1); + config->setMaxWorkspaceSize(1_GiB); + config->setFlag(BuilderFlag::kDEBUG); + if (dataType == DataType::kHALF) + { + config->setFlag(BuilderFlag::kFP16); + } + if (dataType == DataType::kINT8) + { + config->setFlag(BuilderFlag::kINT8); + } builder->setMaxBatchSize(mParams.batchSize); if (dataType == DataType::kINT8) { - BatchStream calibrationStream( - mParams.calBatchSize, mParams.nbCalBatches, "batches/batch", "", mParams.dataDirs); - calibrator.reset(new Int8EntropyCalibrator2( + MNISTBatchStream calibrationStream(mParams.calBatchSize, mParams.nbCalBatches, "train-images-idx3-ubyte", + "train-labels-idx1-ubyte", mParams.dataDirs); + calibrator.reset(new Int8EntropyCalibrator2( calibrationStream, 0, mParams.networkName.c_str(), mParams.inputTensorNames[0].c_str())); - builder->setInt8Calibrator(calibrator.get()); + config->setInt8Calibrator(calibrator.get()); } if (mParams.dlaCore >= 0) { - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); if (mParams.batchSize > builder->getMaxDLABatchSize()) { gLogError << "Requested batch size " << mParams.batchSize << " is greater than the max DLA batch size of " @@ -239,7 +252,8 @@ bool SampleINT8::constructNetwork(SampleUniquePtr& builder, } } - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; @@ -267,12 +281,13 @@ bool SampleINT8::infer(std::pair& score, int firstScoreBatch, int return false; } - 
BatchStream batchStream(mParams.batchSize, nbScoreBatches, "batches/batch", "", mParams.dataDirs); + MNISTBatchStream batchStream( + mParams.batchSize, nbScoreBatches, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", mParams.dataDirs); batchStream.skip(firstScoreBatch); - Dims3 outputDims = static_cast(context->getEngine().getBindingDimensions( - context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()))); - int outputSize = outputDims.d[0] * outputDims.d[1] * outputDims.d[2]; + Dims outputDims = context->getEngine().getBindingDimensions( + context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str())); + int outputSize = samplesCommon::volume(outputDims); int top1{0}, top5{0}; float totalTime{0.0f}; @@ -353,15 +368,9 @@ bool SampleINT8::teardown() //! bool SampleINT8::processInput(const samplesCommon::BufferManager& buffers, const float* data) { - const int inputC = mInputDims.d[0]; - const int inputH = mInputDims.d[1]; - const int inputW = mInputDims.d[2]; - const int batchSize = mParams.batchSize; - // Fill data buffer float* hostDataBuffer = static_cast(buffers.getHostBuffer(mParams.inputTensorNames[0])); - memcpy(hostDataBuffer, data, batchSize * inputC * inputH * inputW * sizeof(float)); - + std::memcpy(hostDataBuffer, data, mParams.batchSize * samplesCommon::volume(mInputDims) * sizeof(float)); return true; } @@ -397,20 +406,18 @@ int SampleINT8::calculateScore( //! //! \brief Initializes members of the params struct using the command line args //! -SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, const std::string& networkName, int batchSize) +SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, int batchSize) { SampleINT8Params params; - if (args.dataDirs.empty()) //!< Use default directories if user hasn't provided directory paths - { - params.dataDirs.push_back(std::string("data/") + networkName + std::string("/")); - params.dataDirs.push_back(std::string("int8/") + networkName + std::string("/")); - params.dataDirs.push_back(std::string("data/int8/") + networkName + std::string("/")); - params.dataDirs.push_back(std::string("data/int8_samples/") + networkName + std::string("/")); - } - else //!< Use the data directory provided by the user - { - params.dataDirs = args.dataDirs; - } + // Use directories provided by the user, in addition to default directories. + params.dataDirs = args.dataDirs; + params.dataDirs.emplace_back("data/mnist/"); + params.dataDirs.emplace_back("int8/mnist/"); + params.dataDirs.emplace_back("samples/mnist/"); + params.dataDirs.emplace_back("data/samples/mnist/"); + params.dataDirs.emplace_back("data/int8/mnist/"); + params.dataDirs.emplace_back("data/int8_samples/mnist/"); + params.batchSize = batchSize; params.dlaCore = args.useDLACore; params.nbCalBatches = 10; @@ -419,12 +426,7 @@ SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, const s params.outputTensorNames.push_back("prob"); params.prototxtFileName = "deploy.prototxt"; params.weightsFileName = "mnist_lenet.caffemodel"; - params.networkName = networkName; - if (networkName != std::string("mnist")) - { - params.weightsFileName = networkName + ".caffemodel"; - } - + params.networkName = "mnist"; return params; } @@ -433,18 +435,17 @@ SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, const s //! 
void printHelpInfo() { - std::cout << "Usage: ./sample_int8 [-h or --help] [-d or --datadir=] " + std::cout << "Usage: ./sample_int8 [-h or --help] [-d or --datadir=] " "[--useDLACore=]" << std::endl; std::cout << "--help Display help information" << std::endl; std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " - "multiple times to add multiple directories. If no data directories are given, the default is to use " - "data/samples/ssd/ and data/ssd/" + "multiple times to add multiple directories." << std::endl; std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " "where n is the number of DLA engines on the platform." << std::endl; - std::cout << "batch=N Set batch size (default = 100)." << std::endl; + std::cout << "batch=N Set batch size (default = 32)." << std::endl; std::cout << "start=N Set the first batch to be scored (default = 100). All batches before this batch will " "be used for calibration." << std::endl; @@ -453,20 +454,19 @@ void printHelpInfo() int main(int argc, char** argv) { - if (argc < 2 || !strncmp(argv[1], "help", 4) || !strncmp(argv[1], "--help", 6) || !strncmp(argv[1], "--h", 3)) + if (argc >= 2 && (!strncmp(argv[1], "help", 4) || !strncmp(argv[1], "--help", 6) || !strncmp(argv[1], "--h", 3))) { printHelpInfo(); return EXIT_FAILURE; } - std::string networkName = argv[1]; - // By default we score over 40K images starting at 10000, so we don't score those used to search calibration - int batchSize = 100; + // By default we score over 40K images starting at 3200, so we don't score those used to search calibration + int batchSize = 32; int firstScoreBatch = 100; int nbScoreBatches = 400; // Parse extra arguments - for (int i = 2; i < argc; i++) + for (int i = 1; i < argc; ++i) { if (!strncmp(argv[i], "batch=", 6)) { @@ -497,7 +497,7 @@ int main(int argc, char** argv) samplesCommon::Args args; samplesCommon::parseArgs(args, argc, argv); - SampleINT8 sample(initializeSampleParams(args, networkName, batchSize)); + SampleINT8 sample(initializeSampleParams(args, batchSize)); auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); diff --git a/samples/opensource/sampleINT8API/README.md b/samples/opensource/sampleINT8API/README.md index aece468a..173ffdd7 100644 --- a/samples/opensource/sampleINT8API/README.md +++ b/samples/opensource/sampleINT8API/README.md @@ -33,7 +33,7 @@ Specifically, this sample demonstrates how to: In order to perform INT8 inference, you need to provide TensorRT with the dynamic range for each network tensor, including network input and output tensor. One way to choose the dynamic range is to use the TensorRT INT8 calibrator. But if you don't want to go that route (for example, let’s say you used quantization-aware training or you just want to use the min and max tensor values seen during training), you can skip the INT8 calibration and set custom per-network tensor dynamic ranges. This sample implements INT8 inference for the ONNX ResNet-50 model using per-network tensor dynamic ranges specified in an input file. -This sample uses the [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/resnet50). +This sample uses the [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/vision/classification/resnet/resnet50). 
Specifically, this sample performs the following steps: - [Configuring the builder to use INT8 without the INT8 calibrator](#configuring-the-builder-to-use-int8-without-the-int8-calibrator) @@ -160,7 +160,7 @@ The ResNet-50 per tensor dynamic ranges file. `airliner.ppm` The image to be inferred. -1. Download the [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/resnet50). +1. Download the [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/vision/classification/resnet/resnet50). `wget https://s3.amazonaws.com/download.onnx/models/opset_3/resnet50.tar.gz` 2. Unpackage the model file. @@ -239,7 +239,7 @@ To see the full list of available options and their descriptions, use the `-h` o In order to use this sample with other model files with a custom configuration, perform the following steps: -1. Download the [Image Classification model files](https://github.com/onnx/models/tree/master/models/image_classification) from GitHub. +1. Download the [Image Classification model files](https://github.com/onnx/models/tree/master/vision/classification) from GitHub. 2. Create an input image with a PPM extension. Resize it with the dimensions of 224x224x3. @@ -298,8 +298,8 @@ The following resources provide a deeper understanding how to perform inference - [8-bit Inference with TensorRT](http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf) **Models:** -- [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/resnet50) -- [Image Classification Model Files](https://github.com/onnx/models/tree/master/models/image_classification) +- [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/vision/classification/resnet/resnet50) +- [Image Classification Model Files](https://github.com/onnx/models/tree/master/vision/classification) **Blogs:** - [Why are Eight Bits Enough for Deep Neural Networks?](https://petewarden.com/2015/05/23/why-are-eight-bits-enough-for-deep-neural-networks/) diff --git a/samples/opensource/sampleINT8API/sampleINT8API.cpp b/samples/opensource/sampleINT8API/sampleINT8API.cpp index 072d0c6f..eff94958 100644 --- a/samples/opensource/sampleINT8API/sampleINT8API.cpp +++ b/samples/opensource/sampleINT8API/sampleINT8API.cpp @@ -92,17 +92,17 @@ class SampleINT8API //! //! \brief Builds the network engine //! - bool build(); + Logger::TestResult build(); //! //! \brief Runs the TensorRT inference engine for this sample //! - bool infer(); + Logger::TestResult infer(); //! //! \brief Used to clean up any state created in the sample class //! - bool teardown(); + Logger::TestResult teardown(); SampleINT8APIParams mParams; //!< Stores Sample Parameter @@ -445,27 +445,34 @@ bool SampleINT8API::verifyOutput(const samplesCommon::BufferManager& buffers) co //! //! \return Returns true if the engine was created successfully and false otherwise //! -bool SampleINT8API::build() +Logger::TestResult SampleINT8API::build() { auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); if (!builder) { gLogError << "Unable to create builder object." << std::endl; - return false; + return Logger::TestResult::kFAILED; } auto network = SampleUniquePtr(builder->createNetwork()); if (!network) { gLogError << "Unable to create network object." << mParams.referenceFileName << std::endl; - return false; + return Logger::TestResult::kFAILED; + } + + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + gLogError << "Unable to create config object." 
<< mParams.referenceFileName << std::endl; + return Logger::TestResult::kFAILED; } auto parser = SampleUniquePtr(nvonnxparser::createParser(*network, gLogger.getTRTLogger())); if (!parser) { gLogError << "Unable to create parser object." << mParams.referenceFileName << std::endl; - return false; + return Logger::TestResult::kFAILED; } // Parse ONNX model file to populate TensorRT INetwork @@ -473,34 +480,34 @@ bool SampleINT8API::build() if (!parser->parseFromFile(mParams.modelFileName.c_str(), verbosity)) { gLogError << "Unable to parse ONNX model file: " << mParams.modelFileName << std::endl; - return false; + return Logger::TestResult::kFAILED; } if (mParams.writeNetworkTensors) { writeNetworkTensorNames(network); - return false; + return Logger::TestResult::kWAIVED; } if (!builder->platformHasFastInt8()) { - gLogError << "Platform does not support INT8 inference. SampleINT8API can only run in INT8 Mode." << std::endl; - return false; + gLogError << "Platform does not support INT8 inference. sampleINT8API can only run in INT8 Mode." << std::endl; + return Logger::TestResult::kWAIVED; } // Configure buider - builder->allowGPUFallback(true); - builder->setMaxWorkspaceSize(1_GB); + config->setFlag(BuilderFlag::kGPU_FALLBACK); + config->setMaxWorkspaceSize(1_GiB); // Enable INT8 model. Required to set custom per tensor dynamic range or INT8 Calibration - builder->setInt8Mode(true); + config->setFlag(BuilderFlag::kINT8); // Mark calibrator as null. As user provides dynamic range for each tensor, no calibrator is required - builder->setInt8Calibrator(nullptr); + config->setInt8Calibrator(nullptr); auto maxBatchSize = mParams.batchSize; if (mParams.dlaCore >= 0) { - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); if (maxBatchSize > builder->getMaxDLABatchSize()) { std::cerr << "Requested batch size " << maxBatchSize << " is greater than the max DLA batch size of " @@ -511,22 +518,23 @@ bool SampleINT8API::build() builder->setMaxBatchSize(maxBatchSize); // force layer to execute with required precision - builder->setStrictTypeConstraints(true); + config->setFlag(BuilderFlag::kSTRICT_TYPES); setLayerPrecision(network); // set INT8 Per Tensor Dynamic range if (!setDynamicRange(network)) { gLogError << "Unable to set per tensor dynamic range." << std::endl; - return false; + return Logger::TestResult::kFAILED; } // build TRT engine - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { gLogError << "Unable to build cuda engine." << std::endl; - return false; + return Logger::TestResult::kFAILED; } // populates input output map structure @@ -539,7 +547,7 @@ bool SampleINT8API::build() const int outputIndex = mEngine.get()->getBindingIndex(mInOut["output"].c_str()); mOutputDims = mEngine.get()->getBindingDimensions(outputIndex); - return true; + return Logger::TestResult::kRUNNING; } //! @@ -548,7 +556,7 @@ bool SampleINT8API::build() //! \details This function is the main execution function of the sample. It allocates //! the buffer, sets inputs, executes the engine, and verifies the output //! 
-bool SampleINT8API::infer() +Logger::TestResult SampleINT8API::infer() { // Create RAII buffer manager object samplesCommon::BufferManager buffers(mEngine, mParams.batchSize); @@ -556,7 +564,7 @@ bool SampleINT8API::infer() auto context = SampleUniquePtr(mEngine->createExecutionContext()); if (!context) { - return false; + return Logger::TestResult::kFAILED; } // Read the input data into the managed buffers @@ -564,7 +572,7 @@ bool SampleINT8API::infer() if (!prepareInput(buffers)) { - return false; + return Logger::TestResult::kFAILED; } // Create CUDA stream for the execution of this inference @@ -577,7 +585,7 @@ bool SampleINT8API::infer() // Asynchronously enqueue the inference work if (!context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr)) { - return false; + return Logger::TestResult::kFAILED; } // Asynchronously copy data from device output buffers to host output buffers @@ -590,15 +598,15 @@ bool SampleINT8API::infer() cudaStreamDestroy(stream); // Check and print the output of the inference - return verifyOutput(buffers); + return verifyOutput(buffers) ? Logger::TestResult::kRUNNING : Logger::TestResult::kFAILED; } //! //! \brief Used to clean up any state created in the sample class //! -bool SampleINT8API::teardown() +Logger::TestResult SampleINT8API::teardown() { - return true; + return Logger::TestResult::kRUNNING; } //! @@ -616,7 +624,7 @@ struct SampleINT8APIArgs : public samplesCommon::Args std::string networkTensorsFileName{"network_tensors.txt"}; }; -//! \brief This function parses arguments specific to sampleINT8API +//! \brief This function parses arguments specific to SampleINT8API //! bool parseSampleINT8APIArgs(SampleINT8APIArgs& args, int argc, char* argv[]) { @@ -804,17 +812,22 @@ int main(int argc, char** argv) SampleINT8API sample(params); gLogInfo << "Building and running a INT8 GPU inference engine for " << params.modelFileName << std::endl; - if (!sample.build()) + auto buildStatus = sample.build(); + if (buildStatus == Logger::TestResult::kWAIVED) + { + return gLogger.reportWaive(sampleTest); + } + else if (buildStatus == Logger::TestResult::kFAILED) { return gLogger.reportFail(sampleTest); } - if (!sample.infer()) + if (sample.infer() != Logger::TestResult::kRUNNING) { return gLogger.reportFail(sampleTest); } - if (!sample.teardown()) + if (sample.teardown() != Logger::TestResult::kRUNNING) { return gLogger.reportFail(sampleTest); } diff --git a/samples/opensource/sampleMLP/sampleMLP.cpp b/samples/opensource/sampleMLP/sampleMLP.cpp index f99ad301..24b61a4e 100644 --- a/samples/opensource/sampleMLP/sampleMLP.cpp +++ b/samples/opensource/sampleMLP/sampleMLP.cpp @@ -94,8 +94,8 @@ class SampleMLP //! //! \brief Uses the API to create the MLP Network //! - bool constructNetwork( - SampleUniquePtr& builder, SampleUniquePtr& network); + bool constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& config); //! //! \brief Reads the input and stores the result in a managed buffer @@ -153,7 +153,13 @@ bool SampleMLP::build() return false; } - auto constructed = constructNetwork(builder, network); + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + + auto constructed = constructNetwork(builder, network, config); if (!constructed) { return false; @@ -177,8 +183,8 @@ bool SampleMLP::build() //! //! \param builder Pointer to the engine builder //! 
-bool SampleMLP::constructNetwork( - SampleUniquePtr& builder, SampleUniquePtr& network) +bool SampleMLP::constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& config) { // FC layers must still have 3 dimensions, so we create a {C, 1, 1,} matrix. // Currently the mnist example is only trained in FP32 mode. @@ -213,19 +219,23 @@ bool SampleMLP::constructNetwork( // Build engine builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); + config->setMaxWorkspaceSize(16_MiB); builder->setFp16Mode(mParams.fp16); builder->setInt8Mode(mParams.int8); - builder->setFp16Mode(mParams.fp16); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } if (mParams.int8) { - builder->setInt8Mode(true); + config->setFlag(BuilderFlag::kINT8); samplesCommon::setAllTensorScales(network.get(), 64.0f, 64.0f); } - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; diff --git a/samples/opensource/sampleMLP/update_mlp.patch b/samples/opensource/sampleMLP/update_mlp.patch new file mode 100644 index 00000000..b9573a81 --- /dev/null +++ b/samples/opensource/sampleMLP/update_mlp.patch @@ -0,0 +1,35 @@ +diff --git a/examples/3_NeuralNetworks/multilayer_perceptron.py b/examples/3_NeuralNetworks/multilayer_perceptron.py +index cf04b01..44e3986 100644 +--- a/examples/3_NeuralNetworks/multilayer_perceptron.py ++++ b/examples/3_NeuralNetworks/multilayer_perceptron.py +@@ -58,11 +58,11 @@ biases = { + # Create model + def multilayer_perceptron(x): + # Hidden fully connected layer with 256 neurons +- layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1']) ++ layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['h1']), biases['b1'])) + # Hidden fully connected layer with 256 neurons +- layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2']) ++ layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])) + # Output fully connected layer with a neuron for each class +- out_layer = tf.matmul(layer_2, weights['out']) + biases['out'] ++ out_layer = tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['out']), biases['out'])) + return out_layer + + # Construct model +@@ -76,6 +76,9 @@ train_op = optimizer.minimize(loss_op) + # Initializing the variables + init = tf.global_variables_initializer() + ++# 'Saver' op to save and restore all the variables ++saver = tf.train.Saver() ++ + with tf.Session() as sess: + sess.run(init) + +@@ -102,3 +105,5 @@ with tf.Session() as sess: + # Calculate accuracy + accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) + print("Accuracy:", accuracy.eval({X: mnist.test.images, Y: mnist.test.labels})) ++ # Save model weights to disk ++ save_path = saver.save(sess, "/tmp/sampleMLP.ckpt") diff --git a/samples/opensource/sampleMNIST/sampleMNIST.cpp b/samples/opensource/sampleMNIST/sampleMNIST.cpp index 5a45b8b7..ed31771a 100644 --- a/samples/opensource/sampleMNIST/sampleMNIST.cpp +++ b/samples/opensource/sampleMNIST/sampleMNIST.cpp @@ -123,6 +123,12 @@ bool SampleMNIST::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); if (!parser) { @@ 
-131,15 +137,22 @@ bool SampleMNIST::build() constructNetwork(parser, network); builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); - builder->allowGPUFallback(true); - builder->setStrictTypeConstraints(true); - builder->setFp16Mode(mParams.fp16); - builder->setInt8Mode(mParams.int8); + config->setMaxWorkspaceSize(16_MiB); + config->setFlag(BuilderFlag::kGPU_FALLBACK); + config->setFlag(BuilderFlag::kSTRICT_TYPES); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + if (mParams.int8) + { + config->setFlag(BuilderFlag::kINT8); + } - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) return false; @@ -236,7 +249,9 @@ void SampleMNIST::constructNetwork( mMeanBlob = SampleUniquePtr(parser->parseBinaryProto(mParams.meanFileName.c_str())); nvinfer1::Weights meanWeights{nvinfer1::DataType::kFLOAT, mMeanBlob->getData(), inputDims.d[1] * inputDims.d[2]}; - // For this sample, a large range based on the mean data is chosen and applied to the entire network. + // For this sample, a large range based on the mean data is chosen and applied to the head of the network. + // After the mean subtraction occurs, the range is expected to be between -127 and 127, so the rest of the network + // is given a generic range. // The preferred method is use scales computed based on a representative data set // and apply each one individually based on the tensor. The range here is large enough for the // network, but is chosen for example purposes only. @@ -244,9 +259,12 @@ void SampleMNIST::constructNetwork( = samplesCommon::getMaxValue(static_cast(meanWeights.values), samplesCommon::volume(inputDims)); auto mean = network->addConstant(nvinfer1::Dims3(1, inputDims.d[1], inputDims.d[2]), meanWeights); + mean->getOutput(0)->setDynamicRange(-maxMean, maxMean); + network->getInput(0)->setDynamicRange(-maxMean, maxMean); auto meanSub = network->addElementWise(*network->getInput(0), *mean->getOutput(0), ElementWiseOperation::kSUB); + meanSub->getOutput(0)->setDynamicRange(-maxMean, maxMean); network->getLayer(0)->setInput(0, *meanSub->getOutput(0)); - samplesCommon::setAllTensorScales(network.get(), maxMean, maxMean); + samplesCommon::setAllTensorScales(network.get(), 127.0f, 127.0f); } //! diff --git a/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp b/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp index 3848f3b8..d03e7222 100644 --- a/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp +++ b/samples/opensource/sampleMNISTAPI/sampleMNISTAPI.cpp @@ -95,8 +95,8 @@ class SampleMNISTAPI //! //! \brief Uses the API to create the MNIST Network //! - bool constructNetwork( - SampleUniquePtr& builder, SampleUniquePtr& network); + bool constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& config); //! //! 
\brief Reads the input and stores the result in a managed buffer @@ -138,7 +138,13 @@ bool SampleMNISTAPI::build() return false; } - auto constructed = constructNetwork(builder, network); + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + + auto constructed = constructNetwork(builder, network, config); if (!constructed) { return false; @@ -162,8 +168,8 @@ bool SampleMNISTAPI::build() //! //! \param builder Pointer to the engine builder //! -bool SampleMNISTAPI::constructNetwork( - SampleUniquePtr& builder, SampleUniquePtr& network) +bool SampleMNISTAPI::constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& config) { // Create input tensor of shape { 1, 1, 28, 28 } ITensor* data = network->addInput( @@ -222,17 +228,21 @@ bool SampleMNISTAPI::constructNetwork( // Build engine builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); - builder->setFp16Mode(mParams.fp16); + config->setMaxWorkspaceSize(16_MiB); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } if (mParams.int8) { - builder->setInt8Mode(true); + config->setFlag(BuilderFlag::kINT8); samplesCommon::setAllTensorScales(network.get(), 64.0f, 64.0f); } - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; diff --git a/samples/opensource/sampleMovieLens/sampleMovieLens.cpp b/samples/opensource/sampleMovieLens/sampleMovieLens.cpp index d8237fdd..2062d769 100644 --- a/samples/opensource/sampleMovieLens/sampleMovieLens.cpp +++ b/samples/opensource/sampleMovieLens/sampleMovieLens.cpp @@ -111,7 +111,8 @@ class SampleMovieLens //! \brief Parses a Uff model for a MLP NCF model, creates a TensorRT network, and builds a TensorRT engine. //! void constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser); + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser); //! //! \brief Copies a batch of input data from SampleMovieLensParams into managed input buffers //! @@ -172,6 +173,11 @@ bool SampleMovieLens::build() { return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } auto parser = SampleUniquePtr(nvuffparser::createUffParser()); if (!parser) { @@ -179,13 +185,16 @@ bool SampleMovieLens::build() } builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(1_GB); - builder->allowGPUFallback(true); - builder->setStrictTypeConstraints(true); - builder->setFp16Mode(mParams.fp16); - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + config->setMaxWorkspaceSize(1_GiB); + config->setFlag(BuilderFlag::kGPU_FALLBACK); + config->setFlag(BuilderFlag::kSTRICT_TYPES); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); - constructNetwork(builder, network, parser); + constructNetwork(builder, network, config, parser); if (!mEngine) { @@ -199,12 +208,15 @@ bool SampleMovieLens::build() //! \brief Parses a Uff model for a MLP NCF model, creates a TensorRT network, and builds a TensorRT engine. //! 
void SampleMovieLens::constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser) + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser) { nvinfer1::Dims inputIndices; - inputIndices.nbDims = 1; + inputIndices.nbDims = 3; inputIndices.d[0] = mParams.numMoviesPerUser; + inputIndices.d[1] = 1; + inputIndices.d[2] = 1; // There should be two input and three output tensors assert(mParams.inputTensorNames.size() == 2); @@ -252,7 +264,8 @@ void SampleMovieLens::constructNetwork(SampleUniquePtr& buil gLogInfo << "Done constructing network..." << std::endl; - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); } //! @@ -378,7 +391,7 @@ void SampleMovieLens::readInputSample(std::ifstream& file, OutputParams& outPara } i = i.substr(0, i.size() - 1); - outParams.allItems.push_back(stoi(i)); + outParams.allItems.push_back(std::stoi(i)); } // read expected predicted max rating item @@ -399,7 +412,7 @@ void SampleMovieLens::readInputSample(std::ifstream& file, OutputParams& outPara auto pos = line.find(delim); int32_t item = std::stoi(line.substr(0, pos - 1)); float prob = std::stof(line.substr(pos + 2)); - outParams.itemProbPairVec.emplace_back((make_pair(item, prob))); + outParams.itemProbPairVec.emplace_back((std::make_pair(item, prob))); std::getline(file, line); } } @@ -485,8 +498,9 @@ bool SampleMovieLens::verifyOutput( float predictedProb = topKItemProb[i * mParams.topKMovies + k]; float expectedProb = mParams.userToExpectedItemProbMap.at(userIdx).at(k).second; int predictedItem = mParams.userToItemsMap.at(userIdx).at(predictedIdx); - gLogVerbose << "|" << setw(10) << userIdx << " | " << setw(10) << predictedItem << " | " << setw(15) - << expectedProb << " | " << setw(15) << predictedProb << " | " << std::endl; + gLogVerbose << "|" << std::setw(10) << userIdx << " | " << std::setw(10) << predictedItem << " | " + << std::setw(15) << expectedProb << " | " << std::setw(15) << predictedProb << " | " + << std::endl; } } @@ -496,8 +510,8 @@ bool SampleMovieLens::verifyOutput( int maxPredictedIdx = topKItemNumber[i * mParams.topKMovies]; int maxExpectedItem = mParams.userToExpectedItemProbMap.at(userIdx).at(0).first; int maxPredictedItem = mParams.userToItemsMap.at(userIdx).at(maxPredictedIdx); - gLogInfo << "| User :" << setw(4) << userIdx << " | Expected Item :" << setw(5) << maxExpectedItem - << " | Predicted Item :" << setw(5) << maxPredictedItem << " | " << std::endl; + gLogInfo << "| User :" << std::setw(4) << userIdx << " | Expected Item :" << std::setw(5) << maxExpectedItem + << " | Predicted Item :" << std::setw(5) << maxPredictedItem << " | " << std::endl; } return pass; @@ -548,7 +562,7 @@ bool parseSampleMovieLensArgs(SampleMovieLensArgs& args, int argc, char* argv[]) } else if (argStr.substr(0, 13) == "--useDLACore=" && argStr.size() > 13) { - args.dlaCore = stoi(argv[i] + 13); + args.dlaCore = std::stoi(argv[i] + 13); } else { @@ -595,8 +609,9 @@ SampleMovieLensParams initializeSampleParams(const SampleMovieLensArgs& args) void printHelpInfo() { std::cout << "Usage: ./sample_movielens [-h or --help] [-b NUM_USERS] [--useDLACore=] [--verbose]\n"; - std::cout << "--help Display help information\n"; - std::cout << "-b NUM_USERS Number of Users i.e. 
Batch Size (default numUsers==32)\n"; + std::cout << "--help Display help information.\n"; + std::cout << "--verbose Enable verbose prints.\n"; + std::cout << "-b NUM_USERS Number of Users i.e. Batch Size (default numUsers==32).\n"; std::cout << "--useDLACore=N Specify a DLA engine for layers that support " "DLA. Value can range from 0 to n-1, where n is the number of " "DLA engines on the platform." diff --git a/samples/opensource/sampleMovieLens/sampleMovieLensTraining.patch b/samples/opensource/sampleMovieLens/sampleMovieLensTraining.patch new file mode 100644 index 00000000..b52edd47 --- /dev/null +++ b/samples/opensource/sampleMovieLens/sampleMovieLensTraining.patch @@ -0,0 +1,419 @@ +Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + +NOTICE TO LICENSEE: + +This source code and/or documentation ("Licensed Deliverables") are subject to +NVIDIA intellectual property rights under U.S. and international Copyright +laws. + +These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL +to NVIDIA and is being provided under the terms and conditions of a form of +NVIDIA software license agreement by and between NVIDIA and Licensee ("License +Agreement") or electronically accepted by Licensee. Notwithstanding any terms +or conditions to the contrary in the License Agreement, reproduction or +disclosure of the Licensed Deliverables to any third party without the express +written consent of NVIDIA is prohibited. + +NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE +AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE +LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS +OR IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD +TO THESE LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE +AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THESE LICENSED DELIVERABLES. + +U.S. Government End Users. These Licensed Deliverables are a "commercial +item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +"commercial computer software" and "commercial computer software +documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is +provided to the U.S. Government only as a commercial end item. Consistent +with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), +all U.S. Government End Users acquire the Licensed Deliverables with only +those rights set forth herein. + +Any use of the Licensed Deliverables in individual and commercial software +must include, in the user documentation and internal comments to the code, the +above Disclaimer and U.S. Government End Users Notice. + +diff --git a/MLP.py b/MLP.py +index 70566c7..93c0d53 100644 +--- a/MLP.py ++++ b/MLP.py +@@ -1,30 +1,27 @@ + ''' + Created on Aug 9, 2016 + Keras Implementation of Multi-Layer Perceptron (GMF) recommender model in: +-He Xiangnan et al. Neural Collaborative Filtering. In WWW 2017. ++He Xiangnan et al. Neural Collaborative Filtering. In WWW 2017. 
+ + @author: Xiangnan He (xiangnanhe@gmail.com) + ''' +- ++import shutil + import numpy as np ++import tensorflow as tf + +-import theano +-import theano.tensor as T +-import keras +-from keras import backend as K +-from keras import initializations +-from keras.regularizers import l2, activity_l2 +-from keras.models import Sequential, Graph, Model +-from keras.layers.core import Dense, Lambda, Activation +-from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout +-from keras.constraints import maxnorm +-from keras.optimizers import Adagrad, Adam, SGD, RMSprop ++from tensorflow import keras ++from tensorflow.python.keras import initializers ++from tensorflow.python.keras.regularizers import l2 ++from tensorflow.python.keras.models import Model ++from tensorflow.python.keras.layers import Dense, Embedding, Input, Flatten, \ ++ concatenate ++from tensorflow.python.keras.optimizers import Adam, Adagrad, SGD, RMSprop ++from tensorflow.python.framework import graph_util + from evaluate import evaluate_model ++from evaluate import infer_model + from Dataset import Dataset + from time import time +-import sys + import argparse +-import multiprocessing as mp + + #################### Arguments #################### + def parse_args(): +@@ -54,7 +51,50 @@ def parse_args(): + return parser.parse_args() + + def init_normal(shape, name=None): +- return initializations.normal(shape, scale=0.01, name=name) ++ return initializers.he_normal() ++ ++def freeze_checkpoint_graph(output_node_names, checkpoint_model_folder, output_graph_filename): ++ # retrieve the checkpoint fullpath ++ checkpoint = tf.train.get_checkpoint_state(checkpoint_model_folder) ++ input_checkpoint = checkpoint.model_checkpoint_path ++ ++ print(input_checkpoint) ++ # precise the file fullname of our freezed graph ++ absolute_model_folder = "/".join(input_checkpoint.split("/")[:-1]) ++ ++ # we clear devices, to allow tensorflow to control on the loading, where it wants operations to be calculated ++ clear_devices = True ++ ++ # the checkpoint directory has - .meta and .data i.e. 
weights file to be retrieved ++ saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices) ++ ++ # retrieve protobuf graph definition ++ # returns the default graph of the current thread - will be the innermost graph ++ # on which Graph.as_default() context has been entered - global_default_graph if non has been explicitly created ++ graph = tf.get_default_graph() ++ ++ # retrieve graph def for a grpah ++ input_graph_def = graph.as_graph_def() ++ ++ # print the output nodes ++ output_node_list = [n.name for n in tf.get_default_graph().as_graph_def().node] ++ ++ # start the session and restore the weights ++ with tf.Session() as sess: ++ saver.restore(sess, input_checkpoint) ++ ++ # in order to freeze the graph - need to export the variables to constants ++ output_graph_def = graph_util.convert_variables_to_constants( ++ sess, # session have weights stored ++ input_graph_def, ++ output_node_names.split(",") ++ ) ++ ++ # finally we serialize and dump the output graph to the filesystem ++ with tf.gfile.GFile(output_graph_filename, "wb") as f: ++ f.write(output_graph_def.SerializeToString()) ++ ++ print("[FREEZE_INFO] ", len(output_graph_def.node), " ops in the final graph.") + + def get_model(num_users, num_items, layers = [20,10], reg_layers=[0,0]): + assert len(layers) == len(reg_layers) +@@ -63,29 +103,43 @@ def get_model(num_users, num_items, layers = [20,10], reg_layers=[0,0]): + user_input = Input(shape=(1,), dtype='int32', name = 'user_input') + item_input = Input(shape=(1,), dtype='int32', name = 'item_input') + +- MLP_Embedding_User = Embedding(input_dim = num_users, output_dim = layers[0]/2, name = 'user_embedding', +- init = init_normal, W_regularizer = l2(reg_layers[0]), input_length=1) +- MLP_Embedding_Item = Embedding(input_dim = num_items, output_dim = layers[0]/2, name = 'item_embedding', +- init = init_normal, W_regularizer = l2(reg_layers[0]), input_length=1) +- ++ MLP_Embedding_User = Embedding(input_dim=num_users, ++ output_dim=int(layers[0] // 2), ++ name='user_embedding', ++ embeddings_initializer='random_uniform', ++ embeddings_regularizer=l2(reg_layers[0]), ++ input_length=1) ++ MLP_Embedding_Item = Embedding(input_dim=num_items, ++ output_dim=int(layers[0] // 2), ++ name='item_embedding', ++ embeddings_initializer='random_uniform', ++ embeddings_regularizer=l2(reg_layers[0]), ++ input_length=1) + # Crucial to flatten an embedding vector! 
+ user_latent = Flatten()(MLP_Embedding_User(user_input)) + item_latent = Flatten()(MLP_Embedding_Item(item_input)) +- ++ + # The 0-th layer is the concatenation of embedding layers +- vector = merge([user_latent, item_latent], mode = 'concat') +- ++ vector = concatenate([user_latent, item_latent]) ++ + # MLP layers +- for idx in xrange(1, num_layer): +- layer = Dense(layers[idx], W_regularizer= l2(reg_layers[idx]), activation='relu', name = 'layer%d' %idx) ++ for idx in range(1, num_layer): ++ print(idx, " : ", layers[idx]) ++ layer = Dense(layers[idx], ++ kernel_regularizer=l2(reg_layers[idx]), ++ activation='relu', ++ name='layer%d'%idx) + vector = layer(vector) +- ++ + # Final prediction layer +- prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(vector) +- +- model = Model(input=[user_input, item_input], +- output=prediction) +- ++ prediction = Dense(1, ++ activation='sigmoid', ++ kernel_initializer='lecun_uniform', ++ name='prediction')(vector) ++ ++ model = Model(inputs=[user_input, item_input], ++ outputs=prediction) ++ + return model + + def get_train_instances(train, num_negatives): +@@ -97,9 +151,10 @@ def get_train_instances(train, num_negatives): + item_input.append(i) + labels.append(1) + # negative instances +- for t in xrange(num_negatives): ++ for t in range(num_negatives): + j = np.random.randint(num_items) +- while train.has_key((u, j)): ++ #while train.has_key((u, j)): ++ while (u, j) in train: + j = np.random.randint(num_items) + user_input.append(u) + item_input.append(j) +@@ -118,61 +173,73 @@ if __name__ == '__main__': + batch_size = args.batch_size + epochs = args.epochs + verbose = args.verbose +- ++ + topK = 10 + evaluation_threads = 1 #mp.cpu_count() + print("MLP arguments: %s " %(args)) +- model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' %(args.dataset, args.layers, time()) +- ++ + # Loading data + t1 = time() + dataset = Dataset(args.path + args.dataset) + train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives + num_users, num_items = train.shape +- print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" ++ print("Load data done [%.1f s]. 
#user=%d, #item=%d, #train=%d, #test=%d" + %(time()-t1, num_users, num_items, train.nnz, len(testRatings))) +- ++ + # Build model + model = get_model(num_users, num_items, layers, reg_layers) +- if learner.lower() == "adagrad": ++ if learner.lower() == "adagrad": + model.compile(optimizer=Adagrad(lr=learning_rate), loss='binary_crossentropy') + elif learner.lower() == "rmsprop": + model.compile(optimizer=RMSprop(lr=learning_rate), loss='binary_crossentropy') + elif learner.lower() == "adam": + model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy') + else: +- model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy') +- ++ model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy') ++ + # Check Init performance + t1 = time() + (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + print('Init: HR = %.4f, NDCG = %.4f [%.1f]' %(hr, ndcg, time()-t1)) +- ++ ++ saver = tf.train.Saver() ++ + # Train model + best_hr, best_ndcg, best_iter = hr, ndcg, -1 +- for epoch in xrange(epochs): ++ for epoch in range(epochs): ++ print("Training epochs : ", epoch) + t1 = time() + # Generate training instances + user_input, item_input, labels = get_train_instances(train, num_negatives) +- +- # Training ++ ++ # Training + hist = model.fit([np.array(user_input), np.array(item_input)], #input +- np.array(labels), # labels +- batch_size=batch_size, nb_epoch=1, verbose=0, shuffle=True) ++ np.array(labels), # labels ++ batch_size=batch_size, epochs=1, verbose=0, shuffle=True) + t2 = time() + + # Evaluation + if epoch %verbose == 0: + (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads) + hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0] +- print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' ++ print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' + % (epoch, t2-t1, hr, ndcg, loss, time()-t2)) + if hr > best_hr: + best_hr, best_ndcg, best_iter = hr, ndcg, epoch +- if args.out > 0: +- model.save_weights(model_out_file, overwrite=True) ++ # Model is trained, all epochs are done, save the golden data ++ infer_model(model, testRatings, testNegatives, topK, evaluation_threads) + + print("End. Best Iteration %d: HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg)) +- if args.out > 0: +- print("The best MLP model is saved to %s" %(model_out_file)) ++ # Get keras session ++ save_path = saver.save(tf.keras.backend.get_session(), './ckpts/sampleMovieLens.ckpt') ++ ++ output_node_names = "prediction/Sigmoid" ++ checkpoint_model_folder = "./ckpts/"; ++ output_graph_filename = "sampleMovieLens.pb" ++ ++ # convert checkpoints to frozen graph ++ freeze_checkpoint_graph(output_node_names, checkpoint_model_folder, output_graph_filename) ++ ++ # delete checkpoints file ++ shutil.rmtree("./ckpts") +diff --git a/evaluate.py b/evaluate.py +index 729f07a..6079a8a 100644 +--- a/evaluate.py ++++ b/evaluate.py +@@ -20,6 +20,71 @@ _testRatings = None + _testNegatives = None + _K = None + ++def infer_model(model, testRatings, testNegatives, K, num_thread): ++ """ ++ Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation ++ Return: score of each test rating. 
++ """ ++ global _model ++ global _testRatings ++ global _testNegatives ++ global _K ++ _model = model ++ _testRatings = testRatings ++ _testNegatives = testNegatives ++ _K = K ++ ++ hits, ndcgs = [],[] ++ if(num_thread > 1): # Multi-thread ++ pool = multiprocessing.Pool(processes=num_thread) ++ res = pool.map(eval_one_rating, range(len(_testRatings))) ++ pool.close() ++ pool.join() ++ hits = [r[0] for r in res] ++ ndcgs = [r[1] for r in res] ++ return (hits, ndcgs) ++ ++ # open file to overwrite ++ r = open("./movielens_ratings.txt", 'w') ++ # Single thread ++ for idx in range(len(_testRatings)): ++ (hr,ndcg) = infer_one_rating(idx, r) ++ hits.append(hr) ++ ndcgs.append(ndcg) ++ return (hits, ndcgs) ++def infer_one_rating(idx, r): ++ rating = _testRatings[idx] ++ items = _testNegatives[idx] ++ u = rating[0] ++ gtItem = rating[1] ++ items.append(gtItem) ++ ++ # Get prediction scores ++ map_item_score = {} ++ users = np.full(len(items), u, dtype = 'int32') ++ predictions = _model.predict([users, np.array(items)], ++ batch_size=100, verbose=0) ++ for i in range(len(items)): ++ item = items[i] ++ map_item_score[item] = predictions[i] ++ ++ # Evaluate top rank list ++ ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get) ++ ++ r.write("user : %s\n" % u) ++ r.write("items : %s\n" % items) ++ r.write("predicted_max_rating_item : %s\n" % ranklist[0]) ++ r.write("predicted_max_rating_prob : %s\n" % map_item_score[ranklist[0]]) ++ r.write("Top 10 Ratings:\n") ++ for i in range(len(ranklist)): ++ r.write("%s : %s\n" % (int(ranklist[i]), float(map_item_score[ranklist[i]]))) ++ r.write("#########################################################\n") ++ ++ hr = getHitRatio(ranklist, gtItem) ++ ndcg = getNDCG(ranklist, gtItem) ++ items.pop() ++ return (hr, ndcg) ++ + def evaluate_model(model, testRatings, testNegatives, K, num_thread): + """ + Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation +@@ -44,7 +109,7 @@ def evaluate_model(model, testRatings, testNegatives, K, num_thread): + ndcgs = [r[1] for r in res] + return (hits, ndcgs) + # Single thread +- for idx in xrange(len(_testRatings)): ++ for idx in range(len(_testRatings)): + (hr,ndcg) = eval_one_rating(idx) + hits.append(hr) + ndcgs.append(ndcg) +@@ -61,15 +126,15 @@ def eval_one_rating(idx): + users = np.full(len(items), u, dtype = 'int32') + predictions = _model.predict([users, np.array(items)], + batch_size=100, verbose=0) +- for i in xrange(len(items)): ++ for i in range(len(items)): + item = items[i] + map_item_score[item] = predictions[i] +- items.pop() + + # Evaluate top rank list + ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, gtItem) + ndcg = getNDCG(ranklist, gtItem) ++ items.pop() + return (hr, ndcg) + + def getHitRatio(ranklist, gtItem): +@@ -79,7 +144,7 @@ def getHitRatio(ranklist, gtItem): + return 0 + + def getNDCG(ranklist, gtItem): +- for i in xrange(len(ranklist)): ++ for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) diff --git a/samples/opensource/sampleMovieLensMPS/CMakeLists.txt b/samples/opensource/sampleMovieLensMPS/CMakeLists.txt new file mode 100644 index 00000000..29a4b428 --- /dev/null +++ b/samples/opensource/sampleMovieLensMPS/CMakeLists.txt @@ -0,0 +1,22 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +SET(SAMPLE_SOURCES + sampleMovieLensMPS.cpp +) + +set(SAMPLE_PARSERS "uff") + +include(../../CMakeSamplesTemplate.txt) diff --git a/samples/opensource/sampleMovieLensMPS/README.md b/samples/opensource/sampleMovieLensMPS/README.md new file mode 100644 index 00000000..e6f4fd69 --- /dev/null +++ b/samples/opensource/sampleMovieLensMPS/README.md @@ -0,0 +1,198 @@ +# Movie Recommendation Using MPS (Multi-Process Service) + + +**Table Of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) + * [Importing a network to TensorRT](#importing-a-network-to-tensorrt) + * [Running inference](#running-inference) + * [Verifying the output](#verifying-the-output) + * [TensorRT API layers and ops](#tensorrt-api-layers-and-ops) +- [Training an NCF network](#training-an-ncf-network) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, sampleMovieLensMPS, is an end-to-end sample that imports a trained TensorFlow model and predicts the highest rated movie for each user using MPS (Multi-Process Service). + +MPS allows multiple CUDA processes to share single GPU context. With MPS, multiple overlapping kernel execution and `memcpy` operations from different processes can be scheduled concurrently to achieve maximum utilization. This can be especially effective in increasing parallelism for small networks with low resource utilization such as those primarily consisting of a series of small MLPs. + +This sample is identical to [sampleMovieLens](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#sample_movie) in terms of functionality, but is modified to support concurrent execution in multiple processes. Specifically, this sample demonstrates how to generate weights for a MovieLens dataset that TensorRT can then accelerate. + +**Note:** Currently, sampleMovieLensMPS supports only Linux x86-64 (includes Ubuntu and RedHat) desktop users. + +## How does this sample work? + +The network is trained in TensorFlow on the [MovieLens dataset](https://grouplens.org/datasets/movielens/) containing 6,040 users and 3,706 movies. The NCF recommender system is based off of the [Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031) paper. + +Each query to the network consists of a `userID` and list of `MovieIDs`. The network predicts the highest-rated movie for each user. As trained parameters, the network has embeddings for users and movies, and weights for a sequence of MLPs. 
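The query layout described above maps onto the two flattened input bindings the sample allocates (`user_input` and `item_input`, each with `numUsers * numMoviesPerUser` elements). Below is a minimal sketch of that expansion; `Query`, `queries`, and the field names are hypothetical, illustration-only names.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

struct Query // hypothetical container for one user's query
{
    uint32_t userId;
    std::vector<uint32_t> movieIds; // numMoviesPerUser candidate movies
};

// Expand per-user queries into flat user_input / item_input buffers:
// the user ID is repeated once for every candidate movie, so both buffers
// end up with numUsers * numMoviesPerUser entries.
std::pair<std::vector<uint32_t>, std::vector<uint32_t>> expandQueries(const std::vector<Query>& queries)
{
    std::vector<uint32_t> userInput;
    std::vector<uint32_t> itemInput;
    for (const Query& q : queries)
    {
        for (uint32_t movieId : q.movieIds)
        {
            userInput.push_back(q.userId);
            itemInput.push_back(movieId);
        }
    }
    return {userInput, itemInput};
}
```

Keeping the two buffers the same length lets them be copied directly into the `user_input` and `item_input` bindings without any per-user reshaping.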
+ +Specifically, this sample: +- [Imports a network into TensorRT](#importing-a-network-to-tensorrt) +- [Runs the inference](#running-inference) +- [Verifies its output](#verifying-the-output) + +### Importing a network to TensorRT + +The network is converted from Tensorflow using the UFF converter (see [Converting A Frozen Graph To UFF](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#samplecode3)), and imported using the UFF parser. Constant layers are used to represent the trained parameters within the network, and the MLPs are implemented using MatrixMultiply layers. A TopK operation is added manually after parsing to find the highest rated movie for the given user. + +### Running inference + +The sample fills the input buffer with `userIDs` and their corresponding lists of `MovieIDs`, which are loaded from `movielens_ratings.txt`. Then, it launches the inference to predict the rating probabilities for the movies using TensorRT. The inference will be launched on multiple processes. When MPS is enabled, the processes will share one single CUDA context to reduce context overhead. See [Multi-Process Service Introduction](https://docs.nvidia.com/deploy/mps/index.html) for more details about MPS. + +### Verifying the output + +Finally, the sample compares the outputs predicted by TensorRT with the expected outputs which are given by `movielens_ratings.txt`. For each user, the `MovieID` with the highest probability should match the expected highest-rated `MovieID`. In the verbose mode, the sample also prints out the probability, which should be close to the expected probability. + +### TensorRT API layers and ops + +In this sample, the following layers are used. For more information about these layers, see the [TensorRT Developer Guide: Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#layers) documentation. + +[Activation layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#activation-layer) +The Activation layer implements element-wise activation functions. + +[MatrixMultiply layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#matrixmultiply-layer) +The MatrixMultiply layer implements matrix multiplication for a collection of matrices. 
+ +[Scale layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#scale-layer) +The Scale layer implements a per-tensor, per-channel, or per-element affine transformation and/or exponentiation by constant values. + +[Shuffle layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#shuffle-layer) +The Shuffle layer implements a reshape and transpose operator for tensors. + +[TopK layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#topk-layer) +The TopK layer finds the top `K` maximum (or minimum) elements along a dimension, returning a reduced tensor and a tensor of index positions. + +## Training an NCF network + +This sample comes with a pre-trained model. However, if you want to train your own model, you would need to also convert the model weights to UFF format before you can run the sample. For step-by-step instructions, refer to the `README.md` file in the `sampleMovieLens` directory. + +## Running the sample + +1. Compile this sample by running `make` in the `/samples/sampleMovieLensMPS` directory. The binary named `sample_movielens_mps` will be created in the `/bin` directory. + ``` + cd /samples/sampleMovieLensMPS + make + ``` + Where `` is where you installed TensorRT. + + +2. Set-up an MPS server. + ``` + export CUDA_VISIBLE_DEVICES= + nvidia-smi -i -c EXCLUSIVE_PROCESSexport CUDA_VISIBLE_DEVICES=0 + export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps # Select a location that's accessible to the given $UID + export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log # Select a location that's accessible to the given $UID + nvidia-cuda-mps-control -d # Start the daemon. + ``` + The log files of MPS are located at: + ``` + $CUDA_MPS_LOG_DIRECTORY/control.log + $CUDA_MPS_LOG_DIRECTORY/server.log + ``` +3. Set-up an MPS client. Set the following variables in the client process environment. The `CUDA_VISIBLE_DEVICES` variable should not be set in the client's environment. + ``` + export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps # Set to the same location as the MPS control daemon + export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log # Set to the same location as the MPS control daemon + ``` +4. Run the sample from an MPS client to predict the highest-rated movie for each user on multiple processes. + ``` + cd /bin + ./sample_movielens_mps (default batch=32 i.e. num of users, Number of processes=1) + ./sample_movielens_mps -b -p (bSize=Batch size i.e. num of users, nbProc=Number of processes) + ./sample_movielens_mps --verbose (prints inputs, groundtruth values, expected vs predicted probabilities) + ``` +5. Verify that the sample ran successfully. If the sample runs successfully you should see output similar to the following: + ``` + &&&& RUNNING TensorRT.sample_movielens_mps # build/cuda- 10.0/7.3/x86_64/sample_movielens_mps -b 2 -p 2 + [I] data/samples/movielens/movielens_ratings.txt + [I] Begin parsing model... + [I] End parsing model... + [I] End building engine... + [I] Done execution in process: 24136 . Duration : 214.272 microseconds. + [I] Num of users : 2 + [I] Num of Movies : 100 + [I] | PID : 24136 | User : 0 | Expected Item : 128 | Predicted Item : 128 | + [I] | PID : 24136 | User : 1 | Expected Item : 133 | Predicted Item : 133 | + [I] Done execution in process: 24135 . Duration : 214.176 microseconds. 
+ [I] Num of users : 2 + [I] Num of Movies : 100 + [I] | PID : 24135 | User : 0 | Expected Item : 128 | Predicted Item : 128 | + [I] | PID : 24135 | User : 1 | Expected Item : 133 | Predicted Item : 133 | + [I] Number of processes executed: 2. Number of processes failed: 0. + [I] Total MPS Run Duration: 1737.51 milliseconds. + &&&& PASSED TensorRT.sample_movielens_mps # build/cuda- 10.0/7.3/x86_64/sample_movielens_mps -b 2 -p 2 + ``` + This output shows that the sample ran successfully; `PASSED`. The output also shows that the predicted items for each user matches the expected items and the duration of the execution. Finally, the sample prints out the PIDs of the processes, showing that the inference is launched on multiple processes. + +6. To restore the system to its original state, shutdown MPS, if needed. + `echo quit | nvidia-cuda-mps-control` + +### Sample --help options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. For example: +``` +Usage: + ./sample_movielens_mps [-h] [-b NUM_USERS] [-p NUM_PROCESSES] [--useDLACore=] [--verbose] + -h Display help information. All single dash options enable perf mode. + -b Number of Users i.e. Batch Size (default numUsers=32). + -p Number of child processes to launch (default nbProcesses=1. Using MPS with this option is strongly recommended). + --useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform. + --verbose Enable verbose prints. + --int8 Run in Int8 mode. + --fp16 Run in FP16 mode. +``` + +# Additional resources + +The following resources provide a deeper understanding about sampleMovieLensMPS: + +**MovieLensMPS** +- [MovieLens dataset](https://grouplens.org/datasets/movielens/) +- [Neural Collaborative Filtering Paper](https://arxiv.org/abs/1708.05031) +- [Multi-Process Service Introduction](https://docs.nvidia.com/deploy/mps/index.html) + +**Models** +- [Neural Collaborative Filtering GitHub Repo](https://github.com/hexiangnan/neural_collaborative_filtering) + +**Documentation** +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Jupyter Notebook Tutorial for SampleMovieLens](https://developer.download.nvidia.com/compute/machine-learning/tensorrt/models/sampleMLP-notebook.html?ncid=--47568) +- [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +# License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + + +# Changelog + +February 2019 +This `README.md` file was recreated, updated and reviewed. + + +# Known issues + +- Since the UFF converter is not currently supported on Windows, the model cannot be converted to UFF on Windows systems. It is still possible to use the UFF file shipped with the sample. diff --git a/samples/opensource/sampleMovieLensMPS/preprocess.py b/samples/opensource/sampleMovieLensMPS/preprocess.py new file mode 100644 index 00000000..ca641ad1 --- /dev/null +++ b/samples/opensource/sampleMovieLensMPS/preprocess.py @@ -0,0 +1,23 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import graphsurgeon as gs +import tensorflow as tf + +def preprocess(dynamic_graph): + axis = dynamic_graph.find_nodes_by_path("concatenate/concat/axis")[0] + # Set axis to 2, because of discrepancies between TensorFlow and TensorRT. + axis.attr["value"].tensor.int_val[0] = 2 diff --git a/samples/opensource/sampleMovieLensMPS/sampleMovieLensMPS.cpp b/samples/opensource/sampleMovieLensMPS/sampleMovieLensMPS.cpp new file mode 100644 index 00000000..8250123f --- /dev/null +++ b/samples/opensource/sampleMovieLensMPS/sampleMovieLensMPS.cpp @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Required to enable MPS Support +#include +#include +#include + +#ifndef _MSC_VER +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include "NvInfer.h" +#include "NvUffParser.h" +#include "common.h" +#include "logger.h" + +using namespace nvinfer1; +using namespace nvuffparser; + +const std::string gSampleName = "TensorRT.sample_movielens_mps"; + +// constants that are known about the MovieLens (NCF) MLP network. +static const int32_t NUM_USERS{32}; // Total number of users. +static const int32_t TOPK_MOVIES{1}; // The output of the topK layer for MovieLens sample. +static const int32_t NUM_INDICES{100}; // Total numbers of Movies to predict per user. +static const int32_t EMBEDDING_VEC_SIZE{32}; // Embedding vector size of each user and item. +static const int32_t THREADS{1}; +static const char* USER_BLOB_NAME{"user_input"}; // user input blob name. +static const char* ITEM_BLOB_NAME{"item_input"}; // item input blob name. +static const char* TOPK_ITEM_PROB{"topk_values"}; // predicted item probability blob name. +static const char* TOPK_ITEM_NAME{"topk_items"}; // predicted item probability blob name. +static const char* RATING_INPUT_FILE{ + "movielens_ratings.txt"}; // The default input file with 50 users and groundtruth data. 
+static const char* DEFAULT_WEIGHT_FILE{"sampleMovieLens.wts2"}; // The weight file produced from README.txt +static const char* UFF_MODEL_FILE{"sampleMovieLens.uff"}; +static const char* UFF_OUTPUT_NODE{"prediction/Sigmoid"}; +static const char* ENGINE_FILE{"sampleMovieLens.engine"}; +static const int32_t DEVICE{0}; +static const std::vector directories{"data/samples/movielens/", "data/movielens/"}; + +template +using SampleUniquePtr = std::unique_ptr; + +class Semaphore +{ +public: + Semaphore(const char* semName) + : mSemName(semName) + { + } + + ~Semaphore() + { + sem_unlink(mSemName); + sem_close(mSemEngine); + } + + void wait() + { + sem_wait(mSemEngine); + } + + void post() + { + sem_post(mSemEngine); + } + + void open() + { + mSemEngine = sem_open(mSemName, O_CREAT | O_EXCL, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, 0); + if (mSemEngine == SEM_FAILED) + { + throw std::runtime_error("Could not create semaphore"); + } + } + +private: + const char* mSemName; + sem_t* mSemEngine; +}; + +class SharedMemory +{ +public: + SharedMemory(const char* modelStreamFd) + : mModelStreamFd(modelStreamFd) + { + } + + ~SharedMemory() + { + shm_unlink(mModelStreamFd); + } + + int open_ro() + { + return open(O_RDONLY, 0666); + } + + int open_rw() + { + return open(O_RDWR | O_CREAT, 0666); + } + +private: + int open(int flag, mode_t mode) + { + int fd = shm_open(mModelStreamFd, flag, mode); + if (fd <= 0) + { + throw std::runtime_error("Could not create file descriptor: /dev/shm" + std::string(mModelStreamFd)); + } + return fd; + } + const char* mModelStreamFd; +}; + +// The OutptutArgs struct holds intermediate/final outputs generated by the MovieLens structure per user. +struct OutputArgs +{ + int32_t userId; // The user Id per batch. + int32_t expectedPredictedMaxRatingItem; // The Expected Max Rating Item per user (inference ground truth). + float expectedPredictedMaxRatingItemProb; // The Expected Max Rating Probability. (inference ground truth). + std::vector allItems; // All inferred items per user. + std::vector> itemProbPairVec; // Expected topK items and prob per user. +}; // struct pargs + +struct Args +{ + int32_t embeddingVecSize{EMBEDDING_VEC_SIZE}; + int32_t numUsers{NUM_USERS}; // Total number of users. Should be equal to ratings file users count. + int32_t topKMovies{TOPK_MOVIES}; // TopK movies per user. + int32_t numMoviesPerUser{NUM_INDICES}; // The number of movies per user. + int32_t nbProcesses{THREADS}; // Number of concurrent processes + std::string weightFile{DEFAULT_WEIGHT_FILE}; // Weight file (.wts2) format Movielens sample. + std::string ratingInputFile{RATING_INPUT_FILE}; // The input rating file. + std::string uffFile{UFF_MODEL_FILE}; + std::string engineFile{ENGINE_FILE}; + bool enableFP16{false}; // Enable ability to run in FP16 mode. + bool enableInt8{false}; // Enable ability to run in Int8 mode. + bool enableVerbose{false}; // Set reportable severity of logger to kVERBOSE. + bool help{false}; // Print help info. + int useDLACore{-1}; + // The below structures are used to compare the predicted values to inference (ground truth) + std::map> userToItemsMap; // Lookup for inferred items for each user. + std::map>> + userToExpectedItemProbMap; // Lookup for topK items and probs for each user. + int32_t device{DEVICE}; + std::vector pargsVec; + std::atomic failCount; // Number threads that failed inference. 
+}; // struct args + +struct Batch +{ + Batch(ICudaEngine* engine, void* userInputPtr, void* itemInputPtr, const Args& args) + { + mEngine = engine; + mContext = SampleUniquePtr(mEngine->createExecutionContext()); + + CHECK(cudaStreamCreate(&mStream)); + + // In order to bind the buffers, we need to know the names of the input and output tensors. + // note that indices are guaranteed to be less than IEngine::getNbBindings() + int userInputIndex = mEngine->getBindingIndex(USER_BLOB_NAME); + int itemInputIndex = mEngine->getBindingIndex(ITEM_BLOB_NAME); + int outputPredictionIndex = mEngine->getBindingIndex(UFF_OUTPUT_NODE); + int outputItemProbIndex = mEngine->getBindingIndex(TOPK_ITEM_PROB); + int outputItemNameIndex = mEngine->getBindingIndex(TOPK_ITEM_NAME); + + mMemSizes.push_back(args.numUsers * args.numMoviesPerUser * sizeof(float)); + mMemSizes.push_back(args.numUsers * args.numMoviesPerUser * sizeof(float)); + mMemSizes.push_back(args.numUsers * args.numMoviesPerUser * sizeof(float)); + mMemSizes.push_back(args.numUsers * args.topKMovies * sizeof(float)); + mMemSizes.push_back(args.numUsers * args.topKMovies * sizeof(float)); + + CHECK(cudaMallocHost(&mHostMemory[userInputIndex], mMemSizes[userInputIndex])); + CHECK(cudaMallocHost(&mHostMemory[itemInputIndex], mMemSizes[itemInputIndex])); + CHECK(cudaMallocHost(&mHostMemory[outputPredictionIndex], mMemSizes[outputPredictionIndex])); + CHECK(cudaMallocHost(&mHostMemory[outputItemProbIndex], mMemSizes[outputItemProbIndex])); + CHECK(cudaMallocHost(&mHostMemory[outputItemNameIndex], mMemSizes[outputItemNameIndex])); + + // copy the data to host memory + for (unsigned int i = 0; i < (mMemSizes[userInputIndex]) / sizeof(float); ++i) + { + *(static_cast(mHostMemory[userInputIndex]) + i) = *((uint32_t*) userInputPtr + i); + } + for (unsigned int i = 0; i < (mMemSizes[itemInputIndex]) / sizeof(float); ++i) + { + *(static_cast(mHostMemory[itemInputIndex]) + i) = *((uint32_t*) itemInputPtr + i); + } + + // allocate GPU memory + CHECK(cudaMalloc(&mDeviceMemory[userInputIndex], mMemSizes[userInputIndex])); + CHECK(cudaMalloc(&mDeviceMemory[itemInputIndex], mMemSizes[itemInputIndex])); + CHECK(cudaMalloc(&mDeviceMemory[outputPredictionIndex], mMemSizes[outputPredictionIndex])); + CHECK(cudaMalloc(&mDeviceMemory[outputItemProbIndex], mMemSizes[outputItemProbIndex])); + CHECK(cudaMalloc(&mDeviceMemory[outputItemNameIndex], mMemSizes[outputItemNameIndex])); + } + + ~Batch() + { + for (auto p : mHostMemory) + CHECK(cudaFreeHost(p)); + for (auto p : mDeviceMemory) + CHECK(cudaFree(p)); + CHECK(cudaStreamDestroy(mStream)); + } + + ICudaEngine* mEngine; + SampleUniquePtr mContext; + cudaStream_t mStream; + void* mHostMemory[5]; + void* mDeviceMemory[5]; + std::vector mMemSizes; +}; + +void printHelpInfo() +{ + std::cout + << "Usage:\n" + << " ./sample_movielens_mps [-h or --help] [-b NUM_USERS] [-p NUM_PROCESSES] [--useDLACore=] [--verbose]\n" + << "-h Display help information. All single dash options enable perf mode.\n" + << "-b Number of Users i.e. Batch Size (default numUsers=32).\n" + << "-p Number of child processes to launch (default nbProcesses=1. Using MPS with this option is " + "strongly recommended).\n" + << "--useDLACore=N Specify a DLA engine for layers that support DLA. 
Value can range from 0 to n-1, where n is " + "the number of DLA engines on the platform.\n" + << "--verbose Enable verbose prints.\n" + << "--int8 Run in Int8 mode.\n" + << "--fp16 Run in FP16 mode.\n" + << std::endl; +} + +// Parse the arguments and return failure if arguments are incorrect +bool parseArgs(Args& args, int argc, char* argv[]) +{ + for (int i = 1; i < argc; ++i) + { + std::string argStr(argv[i]); + + if (argStr == "-h" || argStr == "--help") + { + args.help = true; + return true; + } + if (argStr == "-b") + { + i++; + args.numUsers = std::atoi(argv[i]); + } + else if (argStr == "-p") + { + i++; + args.nbProcesses = std::atoi(argv[i]); + } + else if (argStr == "--verbose") + { + args.enableVerbose = true; + setReportableSeverity(Severity::kVERBOSE); + } + else if (argStr.compare(0, 13, "--useDLACore=") == 0 && argStr.size() > 13) + { + args.useDLACore = std::stoi(argv[i] + 13); + } + else if (argStr == "--int8") + { + args.enableInt8 = true; + } + else if (argStr == "--fp16") + { + args.enableFP16 = true; + } + else + { + return false; + } + } + return true; +} + +void printOutputArgs(OutputArgs& pargs) +{ + gLogVerbose << "User Id : " << pargs.userId << std::endl; + gLogVerbose << "Expected Predicted Max Rating Item : " << pargs.expectedPredictedMaxRatingItem << std::endl; + gLogVerbose << "Expected Predicted Max Rating Prob : " << pargs.expectedPredictedMaxRatingItemProb << std::endl; + gLogVerbose << "Total TopK Items : " << pargs.itemProbPairVec.size() << std::endl; + for (unsigned i = 0; i < pargs.itemProbPairVec.size(); ++i) + gLogVerbose << pargs.itemProbPairVec.at(i).first << " : " << pargs.itemProbPairVec.at(i).second << std::endl; +} + +std::string readNextLine(std::ifstream& file, char delim) +{ + std::string line; + std::getline(file, line); + auto pos = line.find(delim); + line = line.substr(pos + 1); + return line; +} + +void readInputSample(std::ifstream& file, OutputArgs& pargs, std::string line, const Args& args) +{ + // read user name + char delim = ':'; + auto pos = line.find(delim); + line = line.substr(pos + 1); + pargs.userId = std::stoi(line); + // read items + std::string items = readNextLine(file, delim); + items = items.substr(2, items.size() - 2); + std::stringstream ss(items); + std::string i; + while (ss >> i) + { + if (ss.peek() == ',' || ss.peek() == ' ') + ss.ignore(); + i = i.substr(0, i.size() - 1); + pargs.allItems.push_back(std::stoi(i)); + } + + // read expected predicted max rating item + pargs.expectedPredictedMaxRatingItem = std::stoi(readNextLine(file, delim)); + + // read expected predicted max rating prob + std::string prob = readNextLine(file, delim); + prob = prob.substr(2, prob.size() - 3); + pargs.expectedPredictedMaxRatingItemProb = std::stof(prob); + + // skip line + std::getline(file, line); + std::getline(file, line); + + // read all the top 10 prediction ratings + for (int i = 0; i < 10; ++i) + { + auto pos = line.find(delim); + int32_t item = std::stoi(line.substr(0, pos - 1)); + float prob = std::stof(line.substr(pos + 2)); + pargs.itemProbPairVec.emplace_back((std::make_pair(item, prob))); + std::getline(file, line); + } +} + +void parseMovieLensData(Args& args) +{ + std::ifstream file; + file.open(args.ratingInputFile); + std::string line; + int userIdx = 0; + while (std::getline(file, line) && userIdx < args.numUsers) + { + OutputArgs pargs; + readInputSample(file, pargs, line, args); + + // store the pargs in the global data structure. Hack. 
+ args.pargsVec.push_back(pargs); + + args.userToItemsMap[userIdx] = std::move(pargs.allItems); + args.userToExpectedItemProbMap[userIdx] = std::move(pargs.itemProbPairVec); + + userIdx++; + printOutputArgs(pargs); + } + + // number of users should be equal to number of users in rating file + if (args.numUsers != userIdx) + { + throw std::runtime_error("Invalid ratings file."); + } +} + +template +bool printInferenceOutput( + void* userInputPtr, void* itemInputPtr, void* topKItemNumberPtr, void* topKItemProbPtr, const Args& args) +{ + bool pass{true}; + T1* userInput{static_cast(userInputPtr)}; + T1* topKItemNumber{static_cast(topKItemNumberPtr)}; + T2* topKItemProb{static_cast(topKItemProbPtr)}; + + gLogInfo << "Num of users : " << args.numUsers << std::endl; + gLogInfo << "Num of Movies : " << args.numMoviesPerUser << std::endl; + + gLogVerbose << "|-----------|------------|-----------------|-----------------|" << std::endl; + gLogVerbose << "| User | Item | Expected Prob | Predicted Prob |" << std::endl; + gLogVerbose << "|-----------|------------|-----------------|-----------------|" << std::endl; + + for (int i = 0; i < args.numUsers; ++i) + { + int userIdx = userInput[i * args.numMoviesPerUser]; + int maxPredictedIdx = topKItemNumber[i * args.topKMovies]; + int maxExpectedItem = args.userToExpectedItemProbMap.at(userIdx).at(0).first; + int maxPredictedItem = args.userToItemsMap.at(userIdx).at(maxPredictedIdx); + pass &= (maxExpectedItem == maxPredictedItem); + + for (int k = 0; k < args.topKMovies; ++k) + { + int predictedIdx = topKItemNumber[i * args.topKMovies + k]; + float predictedProb = topKItemProb[i * args.topKMovies + k]; + float expectedProb = args.userToExpectedItemProbMap.at(userIdx).at(k).second; + int predictedItem = args.userToItemsMap.at(userIdx).at(predictedIdx); + gLogVerbose << "|" << std::setw(10) << userIdx << " | " << std::setw(10) << predictedItem << " | " + << std::setw(15) << expectedProb << " | " << std::setw(15) << predictedProb << " | " + << std::endl; + } + } + + for (int i = 0; i < args.numUsers; ++i) + { + int userIdx = userInput[i * args.numMoviesPerUser]; + int maxPredictedIdx = topKItemNumber[i * args.topKMovies]; + int maxExpectedItem = args.userToExpectedItemProbMap.at(userIdx).at(0).first; + int maxPredictedItem = args.userToItemsMap.at(userIdx).at(maxPredictedIdx); + gLogInfo << "| PID : " << std::setw(4) << getpid() << " | User :" << std::setw(4) << userIdx + << " | Expected Item :" << std::setw(5) << maxExpectedItem << " | Predicted Item :" << std::setw(5) + << maxPredictedItem << " | " << std::endl; + } + + return pass; +} + +void submitWork(Batch& b, const Args& args) +{ + int userInputIndex = b.mEngine->getBindingIndex(USER_BLOB_NAME); + int itemInputIndex = b.mEngine->getBindingIndex(ITEM_BLOB_NAME); + int outputPredictionIndex = b.mEngine->getBindingIndex(UFF_OUTPUT_NODE); + int outputItemProbIndex = b.mEngine->getBindingIndex(TOPK_ITEM_PROB); + int outputItemNameIndex = b.mEngine->getBindingIndex(TOPK_ITEM_NAME); + + // Copy input from host to device + CHECK(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], + cudaMemcpyHostToDevice, b.mStream)); + CHECK(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], + cudaMemcpyHostToDevice, b.mStream)); + + b.mContext->enqueue(args.numUsers, b.mDeviceMemory, b.mStream, nullptr); + + // copy output from device to host + CHECK(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], 
b.mDeviceMemory[outputPredictionIndex], + b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream)); + CHECK(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], + b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream)); + CHECK(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], + b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream)); +} + +std::shared_ptr loadModelAndCreateEngine(const char* uffFile, IUffParser* parser, const Args& args) +{ + // Create the builder + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); + if (builder == nullptr) + { + throw std::runtime_error("Could not create builder."); + } + + auto network = SampleUniquePtr(builder->createNetwork()); + if (network == nullptr) + { + throw std::runtime_error("Could not create network."); + } + + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (config == nullptr) + { + throw std::runtime_error("Could not create network."); + } + + gLogInfo << "Begin parsing model..." << std::endl; + + auto dType = args.enableFP16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + + // Parse the uff model to populate the network + if (!parser->parse(uffFile, *network, dType)) + { + gLogError << "Failure while parsing UFF file" << std::endl; + return nullptr; + } + + gLogInfo << "End parsing model..." << std::endl; + + // Add postprocessing i.e. topk layer to the UFF Network + // Retrieve last layer of UFF Network + auto uffLastLayer = network->getLayer(network->getNbLayers() - 1); + + // Reshape output of fully connected layer numOfMovies x 1 x 1 x 1 to numOfMovies x 1 x 1. + auto reshapeLayer = network->addShuffle(*uffLastLayer->getOutput(0)); + reshapeLayer->setReshapeDimensions(Dims3{1, args.numMoviesPerUser, 1}); + if (reshapeLayer == nullptr) + { + throw std::runtime_error("Could not create reshape layer."); + } + + // Apply TopK layer to retrieve item probabilities and corresponding index number. + auto topK = network->addTopK(*reshapeLayer->getOutput(0), TopKOperation::kMAX, args.topKMovies, 0x2); + if (topK == nullptr) + { + throw std::runtime_error("Could not create TopK layer."); + } + + // Mark outputs for index and probs. Also need to set the item layer type == kINT32. + topK->getOutput(0)->setName(TOPK_ITEM_PROB); + topK->getOutput(1)->setName(TOPK_ITEM_NAME); + + // Specify topK tensors as outputs + network->markOutput(*topK->getOutput(0)); + network->markOutput(*topK->getOutput(1)); + + // Set the topK indices tensor as INT32 type + topK->getOutput(1)->setType(DataType::kINT32); + + // Build the engine + builder->setMaxBatchSize(args.numUsers); + config->setMaxWorkspaceSize(1_GiB); // The _GiB literal operator is defined in common.h + if (args.enableFP16) + { + config->setFlag(BuilderFlag::kFP16); + } + if (args.enableInt8) + { + config->setFlag(BuilderFlag::kINT8); + } + + samplesCommon::setDummyInt8Scales(config.get(), network.get()); + samplesCommon::enableDLA(builder.get(), config.get(), args.useDLACore); + + auto engine = samplesCommon::infer_object(builder->buildEngineWithConfig(*network, *config)); + if (!engine) + { + gLogError << "Unable to create engine" << std::endl; + return nullptr; + } + + gLogInfo << "End building engine..." 
<< std::endl; + return engine; +} + +bool doInference(void* modelStreamData, int modelStreamSize, void* userInputPtr, void* itemInputPtr, Args& args) +{ + auto runtime = SampleUniquePtr(nvinfer1::createInferRuntime(gLogger.getTRTLogger())); + if (args.useDLACore >= 0) + { + runtime->setDLACore(args.useDLACore); + } + + auto engine + = samplesCommon::infer_object(runtime->deserializeCudaEngine(modelStreamData, modelStreamSize, nullptr)); + + Batch b{engine.get(), userInputPtr, itemInputPtr, args}; + + { + samplesCommon::GpuTimer timer{b.mStream}; + timer.start(); + // Run inference for all the nbProcesses + submitWork(b, args); + cudaStreamSynchronize(b.mStream); + timer.stop(); + gLogInfo << "Done execution in process: " << getpid() << " . Duration : " << timer.microseconds() + << " microseconds." << std::endl; + } + + int outputItemProbIndex = b.mEngine->getBindingIndex(TOPK_ITEM_PROB); + int outputItemNameIndex = b.mEngine->getBindingIndex(TOPK_ITEM_NAME); + + float* topKItemProb = static_cast(b.mHostMemory[outputItemProbIndex]); + uint32_t* topKItemNumber = static_cast(b.mHostMemory[outputItemNameIndex]); + bool pass{printInferenceOutput(userInputPtr, itemInputPtr, topKItemNumber, topKItemProb, args)}; + + return pass; +} + +int mainMovieLensMPS(Args& args, OutputArgs& pargs) +{ + // Parse the ratings file and populate ground truth data + args.ratingInputFile = locateFile(args.ratingInputFile, directories); + gLogInfo << args.ratingInputFile << std::endl; + + // Parse ground truth data and inputs, common to all processes (if using MPS) + parseMovieLensData(args); + + // Create uff parser + args.uffFile = locateFile(args.uffFile, directories); + auto parser = SampleUniquePtr(nvuffparser::createUffParser()); + + // All nbProcesses should wait until the parent is done building the engine. + Semaphore sem("/engine_built"); + sem.open(); + + pid_t pid{}; + // Create child processes + for (int i = 0; i < args.nbProcesses; ++i) + { + pid = fork(); + // Children should not create additional processes. + if (pid == 0) + { + break; + } + else if (pid == -1) + { + throw std::runtime_error("Could not create child process"); + } + } + // Every process needs to know if it's a child or not. + bool isParentProcess = (pid != 0); + + SharedMemory shm("/sampleMovieLens.modelStream"); + + if (isParentProcess) + { + // Parent process should build an engine and write it to the shared buffer. + Dims inputIndices; + inputIndices.nbDims = 3; + inputIndices.d[0] = args.numMoviesPerUser; + inputIndices.d[1] = 1; + inputIndices.d[2] = 1; + + parser->registerInput(USER_BLOB_NAME, inputIndices, UffInputOrder::kNCHW); + parser->registerInput(ITEM_BLOB_NAME, inputIndices, UffInputOrder::kNCHW); + parser->registerOutput(UFF_OUTPUT_NODE); + + auto engine = loadModelAndCreateEngine(args.uffFile.c_str(), parser.get(), args); + if (engine.get() == nullptr) + { + throw std::runtime_error("Failed to create engine."); + } + + auto modelStream = samplesCommon::infer_object(engine->serialize()); + + size_t modelStreamSize = modelStream->size(); + // Create a shared buffer for the modelStream. + int fd = shm.open_rw(); + + fallocate(fd, 0, 0, modelStreamSize); + void* modelStreamData = mmap(NULL, modelStreamSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + // Copy modelStream to the shared buffer. + std::memcpy(modelStreamData, modelStream->data(), modelStreamSize); + // Clean up. + close(fd); + } + else + { + // Allocate input and output buffers on host. 
+ std::vector userInput(args.numUsers * args.numMoviesPerUser * sizeof(float)); + std::vector itemInput(args.numUsers * args.numMoviesPerUser * sizeof(float)); + + for (int i = 0; i < args.numUsers; ++i) + { + for (int k = 0; k < args.numMoviesPerUser; ++k) + { + int idx = i * args.numMoviesPerUser + k; + userInput[idx] = args.pargsVec[i].userId; + itemInput[idx] = args.pargsVec[i].allItems.at(k); + } + } + + // Now wait for parent to construct engine and write the modelstream to the shared buffer. + sem.wait(); + + // Open a file descriptor for the shared buffer. + int fd = shm.open_ro(); + + // Get size of shared memory buffer. + struct stat sb; + fstat(fd, &sb); + int modelStreamSize = sb.st_size; + if (modelStreamSize <= 0) + { + throw std::runtime_error("Failed to fetch model stream from shared memory buffer."); + } + + // Retrieve the modelStream and close the file descriptor. + void* modelStreamData = mmap(NULL, modelStreamSize, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + + // All child processes will do inference and then exit. + bool pass = doInference(modelStreamData, modelStreamSize, userInput.data(), itemInput.data(), args); + if (!pass) + args.failCount++; + + exit(0); + } + + // Let children processes continue + for (int j = 0; j < args.nbProcesses; ++j) + { + sem.post(); + } + + // Then time them. + { + samplesCommon::PreciseCpuTimer timer{}; + timer.start(); + int status; + // Parent should wait for child processes. + for (int i = 0; i < args.nbProcesses; ++i) + { + wait(&status); + } + timer.stop(); + gLogInfo << "Number of processes executed : " << args.nbProcesses + << ". Total MPS Run Duration : " << timer.milliseconds() << " milliseconds." << std::endl; + } + + bool pass = !args.failCount; + return pass; +} + +int main(int argc, char* argv[]) +{ + Args args; // Global struct to store arguments + OutputArgs pargs; // Ratings file struct + + // Parse arguments + bool argsOK = parseArgs(args, argc, argv); + args.failCount = 0; + + if (args.help) + { + printHelpInfo(); + return EXIT_SUCCESS; + } + + if (!argsOK) + { + printHelpInfo(); + gLogError << "Invalid arguments" << std::endl; + return EXIT_FAILURE; + } + + auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); + gLogger.reportTestStart(sampleTest); + bool pass = false; + + try + { + pass = mainMovieLensMPS(args, pargs); + } + catch (const std::exception& e) + { + gLogError << e.what() << std::endl; + } + return gLogger.reportTest(sampleTest, pass); +} diff --git a/samples/opensource/sampleMovieLensMPS/sampleMovieLensTraining.patch b/samples/opensource/sampleMovieLensMPS/sampleMovieLensTraining.patch new file mode 100644 index 00000000..b52edd47 --- /dev/null +++ b/samples/opensource/sampleMovieLensMPS/sampleMovieLensTraining.patch @@ -0,0 +1,419 @@ +Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + +NOTICE TO LICENSEE: + +This source code and/or documentation ("Licensed Deliverables") are subject to +NVIDIA intellectual property rights under U.S. and international Copyright +laws. + +These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL +to NVIDIA and is being provided under the terms and conditions of a form of +NVIDIA software license agreement by and between NVIDIA and Licensee ("License +Agreement") or electronically accepted by Licensee. Notwithstanding any terms +or conditions to the contrary in the License Agreement, reproduction or +disclosure of the Licensed Deliverables to any third party without the express +written consent of NVIDIA is prohibited. 
+ +NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE +AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE +LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS +OR IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD +TO THESE LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE +AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, +INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THESE LICENSED DELIVERABLES. + +U.S. Government End Users. These Licensed Deliverables are a "commercial +item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +"commercial computer software" and "commercial computer software +documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is +provided to the U.S. Government only as a commercial end item. Consistent +with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), +all U.S. Government End Users acquire the Licensed Deliverables with only +those rights set forth herein. + +Any use of the Licensed Deliverables in individual and commercial software +must include, in the user documentation and internal comments to the code, the +above Disclaimer and U.S. Government End Users Notice. + +diff --git a/MLP.py b/MLP.py +index 70566c7..93c0d53 100644 +--- a/MLP.py ++++ b/MLP.py +@@ -1,30 +1,27 @@ + ''' + Created on Aug 9, 2016 + Keras Implementation of Multi-Layer Perceptron (GMF) recommender model in: +-He Xiangnan et al. Neural Collaborative Filtering. In WWW 2017. ++He Xiangnan et al. Neural Collaborative Filtering. In WWW 2017. 
+ + @author: Xiangnan He (xiangnanhe@gmail.com) + ''' +- ++import shutil + import numpy as np ++import tensorflow as tf + +-import theano +-import theano.tensor as T +-import keras +-from keras import backend as K +-from keras import initializations +-from keras.regularizers import l2, activity_l2 +-from keras.models import Sequential, Graph, Model +-from keras.layers.core import Dense, Lambda, Activation +-from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout +-from keras.constraints import maxnorm +-from keras.optimizers import Adagrad, Adam, SGD, RMSprop ++from tensorflow import keras ++from tensorflow.python.keras import initializers ++from tensorflow.python.keras.regularizers import l2 ++from tensorflow.python.keras.models import Model ++from tensorflow.python.keras.layers import Dense, Embedding, Input, Flatten, \ ++ concatenate ++from tensorflow.python.keras.optimizers import Adam, Adagrad, SGD, RMSprop ++from tensorflow.python.framework import graph_util + from evaluate import evaluate_model ++from evaluate import infer_model + from Dataset import Dataset + from time import time +-import sys + import argparse +-import multiprocessing as mp + + #################### Arguments #################### + def parse_args(): +@@ -54,7 +51,50 @@ def parse_args(): + return parser.parse_args() + + def init_normal(shape, name=None): +- return initializations.normal(shape, scale=0.01, name=name) ++ return initializers.he_normal() ++ ++def freeze_checkpoint_graph(output_node_names, checkpoint_model_folder, output_graph_filename): ++ # retrieve the checkpoint fullpath ++ checkpoint = tf.train.get_checkpoint_state(checkpoint_model_folder) ++ input_checkpoint = checkpoint.model_checkpoint_path ++ ++ print(input_checkpoint) ++ # precise the file fullname of our freezed graph ++ absolute_model_folder = "/".join(input_checkpoint.split("/")[:-1]) ++ ++ # we clear devices, to allow tensorflow to control on the loading, where it wants operations to be calculated ++ clear_devices = True ++ ++ # the checkpoint directory has - .meta and .data i.e. 
weights file to be retrieved ++ saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices) ++ ++ # retrieve protobuf graph definition ++ # returns the default graph of the current thread - will be the innermost graph ++ # on which Graph.as_default() context has been entered - global_default_graph if non has been explicitly created ++ graph = tf.get_default_graph() ++ ++ # retrieve graph def for a grpah ++ input_graph_def = graph.as_graph_def() ++ ++ # print the output nodes ++ output_node_list = [n.name for n in tf.get_default_graph().as_graph_def().node] ++ ++ # start the session and restore the weights ++ with tf.Session() as sess: ++ saver.restore(sess, input_checkpoint) ++ ++ # in order to freeze the graph - need to export the variables to constants ++ output_graph_def = graph_util.convert_variables_to_constants( ++ sess, # session have weights stored ++ input_graph_def, ++ output_node_names.split(",") ++ ) ++ ++ # finally we serialize and dump the output graph to the filesystem ++ with tf.gfile.GFile(output_graph_filename, "wb") as f: ++ f.write(output_graph_def.SerializeToString()) ++ ++ print("[FREEZE_INFO] ", len(output_graph_def.node), " ops in the final graph.") + + def get_model(num_users, num_items, layers = [20,10], reg_layers=[0,0]): + assert len(layers) == len(reg_layers) +@@ -63,29 +103,43 @@ def get_model(num_users, num_items, layers = [20,10], reg_layers=[0,0]): + user_input = Input(shape=(1,), dtype='int32', name = 'user_input') + item_input = Input(shape=(1,), dtype='int32', name = 'item_input') + +- MLP_Embedding_User = Embedding(input_dim = num_users, output_dim = layers[0]/2, name = 'user_embedding', +- init = init_normal, W_regularizer = l2(reg_layers[0]), input_length=1) +- MLP_Embedding_Item = Embedding(input_dim = num_items, output_dim = layers[0]/2, name = 'item_embedding', +- init = init_normal, W_regularizer = l2(reg_layers[0]), input_length=1) +- ++ MLP_Embedding_User = Embedding(input_dim=num_users, ++ output_dim=int(layers[0] // 2), ++ name='user_embedding', ++ embeddings_initializer='random_uniform', ++ embeddings_regularizer=l2(reg_layers[0]), ++ input_length=1) ++ MLP_Embedding_Item = Embedding(input_dim=num_items, ++ output_dim=int(layers[0] // 2), ++ name='item_embedding', ++ embeddings_initializer='random_uniform', ++ embeddings_regularizer=l2(reg_layers[0]), ++ input_length=1) + # Crucial to flatten an embedding vector! 
+ user_latent = Flatten()(MLP_Embedding_User(user_input)) + item_latent = Flatten()(MLP_Embedding_Item(item_input)) +- ++ + # The 0-th layer is the concatenation of embedding layers +- vector = merge([user_latent, item_latent], mode = 'concat') +- ++ vector = concatenate([user_latent, item_latent]) ++ + # MLP layers +- for idx in xrange(1, num_layer): +- layer = Dense(layers[idx], W_regularizer= l2(reg_layers[idx]), activation='relu', name = 'layer%d' %idx) ++ for idx in range(1, num_layer): ++ print(idx, " : ", layers[idx]) ++ layer = Dense(layers[idx], ++ kernel_regularizer=l2(reg_layers[idx]), ++ activation='relu', ++ name='layer%d'%idx) + vector = layer(vector) +- ++ + # Final prediction layer +- prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(vector) +- +- model = Model(input=[user_input, item_input], +- output=prediction) +- ++ prediction = Dense(1, ++ activation='sigmoid', ++ kernel_initializer='lecun_uniform', ++ name='prediction')(vector) ++ ++ model = Model(inputs=[user_input, item_input], ++ outputs=prediction) ++ + return model + + def get_train_instances(train, num_negatives): +@@ -97,9 +151,10 @@ def get_train_instances(train, num_negatives): + item_input.append(i) + labels.append(1) + # negative instances +- for t in xrange(num_negatives): ++ for t in range(num_negatives): + j = np.random.randint(num_items) +- while train.has_key((u, j)): ++ #while train.has_key((u, j)): ++ while (u, j) in train: + j = np.random.randint(num_items) + user_input.append(u) + item_input.append(j) +@@ -118,61 +173,73 @@ if __name__ == '__main__': + batch_size = args.batch_size + epochs = args.epochs + verbose = args.verbose +- ++ + topK = 10 + evaluation_threads = 1 #mp.cpu_count() + print("MLP arguments: %s " %(args)) +- model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' %(args.dataset, args.layers, time()) +- ++ + # Loading data + t1 = time() + dataset = Dataset(args.path + args.dataset) + train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives + num_users, num_items = train.shape +- print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" ++ print("Load data done [%.1f s]. 
#user=%d, #item=%d, #train=%d, #test=%d" + %(time()-t1, num_users, num_items, train.nnz, len(testRatings))) +- ++ + # Build model + model = get_model(num_users, num_items, layers, reg_layers) +- if learner.lower() == "adagrad": ++ if learner.lower() == "adagrad": + model.compile(optimizer=Adagrad(lr=learning_rate), loss='binary_crossentropy') + elif learner.lower() == "rmsprop": + model.compile(optimizer=RMSprop(lr=learning_rate), loss='binary_crossentropy') + elif learner.lower() == "adam": + model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy') + else: +- model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy') +- ++ model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy') ++ + # Check Init performance + t1 = time() + (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + print('Init: HR = %.4f, NDCG = %.4f [%.1f]' %(hr, ndcg, time()-t1)) +- ++ ++ saver = tf.train.Saver() ++ + # Train model + best_hr, best_ndcg, best_iter = hr, ndcg, -1 +- for epoch in xrange(epochs): ++ for epoch in range(epochs): ++ print("Training epochs : ", epoch) + t1 = time() + # Generate training instances + user_input, item_input, labels = get_train_instances(train, num_negatives) +- +- # Training ++ ++ # Training + hist = model.fit([np.array(user_input), np.array(item_input)], #input +- np.array(labels), # labels +- batch_size=batch_size, nb_epoch=1, verbose=0, shuffle=True) ++ np.array(labels), # labels ++ batch_size=batch_size, epochs=1, verbose=0, shuffle=True) + t2 = time() + + # Evaluation + if epoch %verbose == 0: + (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads) + hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0] +- print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' ++ print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' + % (epoch, t2-t1, hr, ndcg, loss, time()-t2)) + if hr > best_hr: + best_hr, best_ndcg, best_iter = hr, ndcg, epoch +- if args.out > 0: +- model.save_weights(model_out_file, overwrite=True) ++ # Model is trained, all epochs are done, save the golden data ++ infer_model(model, testRatings, testNegatives, topK, evaluation_threads) + + print("End. Best Iteration %d: HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg)) +- if args.out > 0: +- print("The best MLP model is saved to %s" %(model_out_file)) ++ # Get keras session ++ save_path = saver.save(tf.keras.backend.get_session(), './ckpts/sampleMovieLens.ckpt') ++ ++ output_node_names = "prediction/Sigmoid" ++ checkpoint_model_folder = "./ckpts/"; ++ output_graph_filename = "sampleMovieLens.pb" ++ ++ # convert checkpoints to frozen graph ++ freeze_checkpoint_graph(output_node_names, checkpoint_model_folder, output_graph_filename) ++ ++ # delete checkpoints file ++ shutil.rmtree("./ckpts") +diff --git a/evaluate.py b/evaluate.py +index 729f07a..6079a8a 100644 +--- a/evaluate.py ++++ b/evaluate.py +@@ -20,6 +20,71 @@ _testRatings = None + _testNegatives = None + _K = None + ++def infer_model(model, testRatings, testNegatives, K, num_thread): ++ """ ++ Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation ++ Return: score of each test rating. 
++ """ ++ global _model ++ global _testRatings ++ global _testNegatives ++ global _K ++ _model = model ++ _testRatings = testRatings ++ _testNegatives = testNegatives ++ _K = K ++ ++ hits, ndcgs = [],[] ++ if(num_thread > 1): # Multi-thread ++ pool = multiprocessing.Pool(processes=num_thread) ++ res = pool.map(eval_one_rating, range(len(_testRatings))) ++ pool.close() ++ pool.join() ++ hits = [r[0] for r in res] ++ ndcgs = [r[1] for r in res] ++ return (hits, ndcgs) ++ ++ # open file to overwrite ++ r = open("./movielens_ratings.txt", 'w') ++ # Single thread ++ for idx in range(len(_testRatings)): ++ (hr,ndcg) = infer_one_rating(idx, r) ++ hits.append(hr) ++ ndcgs.append(ndcg) ++ return (hits, ndcgs) ++def infer_one_rating(idx, r): ++ rating = _testRatings[idx] ++ items = _testNegatives[idx] ++ u = rating[0] ++ gtItem = rating[1] ++ items.append(gtItem) ++ ++ # Get prediction scores ++ map_item_score = {} ++ users = np.full(len(items), u, dtype = 'int32') ++ predictions = _model.predict([users, np.array(items)], ++ batch_size=100, verbose=0) ++ for i in range(len(items)): ++ item = items[i] ++ map_item_score[item] = predictions[i] ++ ++ # Evaluate top rank list ++ ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get) ++ ++ r.write("user : %s\n" % u) ++ r.write("items : %s\n" % items) ++ r.write("predicted_max_rating_item : %s\n" % ranklist[0]) ++ r.write("predicted_max_rating_prob : %s\n" % map_item_score[ranklist[0]]) ++ r.write("Top 10 Ratings:\n") ++ for i in range(len(ranklist)): ++ r.write("%s : %s\n" % (int(ranklist[i]), float(map_item_score[ranklist[i]]))) ++ r.write("#########################################################\n") ++ ++ hr = getHitRatio(ranklist, gtItem) ++ ndcg = getNDCG(ranklist, gtItem) ++ items.pop() ++ return (hr, ndcg) ++ + def evaluate_model(model, testRatings, testNegatives, K, num_thread): + """ + Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation +@@ -44,7 +109,7 @@ def evaluate_model(model, testRatings, testNegatives, K, num_thread): + ndcgs = [r[1] for r in res] + return (hits, ndcgs) + # Single thread +- for idx in xrange(len(_testRatings)): ++ for idx in range(len(_testRatings)): + (hr,ndcg) = eval_one_rating(idx) + hits.append(hr) + ndcgs.append(ndcg) +@@ -61,15 +126,15 @@ def eval_one_rating(idx): + users = np.full(len(items), u, dtype = 'int32') + predictions = _model.predict([users, np.array(items)], + batch_size=100, verbose=0) +- for i in xrange(len(items)): ++ for i in range(len(items)): + item = items[i] + map_item_score[item] = predictions[i] +- items.pop() + + # Evaluate top rank list + ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get) + hr = getHitRatio(ranklist, gtItem) + ndcg = getNDCG(ranklist, gtItem) ++ items.pop() + return (hr, ndcg) + + def getHitRatio(ranklist, gtItem): +@@ -79,7 +144,7 @@ def getHitRatio(ranklist, gtItem): + return 0 + + def getNDCG(ranklist, gtItem): +- for i in xrange(len(ranklist)): ++ for i in range(len(ranklist)): + item = ranklist[i] + if item == gtItem: + return math.log(2) / math.log(i+2) diff --git a/samples/opensource/sampleNMT/CMakeLists.txt b/samples/opensource/sampleNMT/CMakeLists.txt new file mode 100644 index 00000000..63248530 --- /dev/null +++ b/samples/opensource/sampleNMT/CMakeLists.txt @@ -0,0 +1,52 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +SET(SAMPLE_SOURCES + sampleNMT.cpp + trtUtil.cpp +) +include(../../CMakeSamplesTemplate.txt) + +# define SAMPLE_NMT_DATA_SOURCES and SAMPLE_NMT_MODEL_SOURCES +set(SAMPLE_NMT_MODEL_SOURCES + model/beamSearchPolicy.cpp + model/componentWeights.cpp + model/contextNMT.cpp + model/debugUtil.cpp + model/lstmDecoder.cpp + model/lstmEncoder.cpp + model/multiplicativeAlignment.cpp + model/slpAttention.cpp + model/slpEmbedder.cpp + model/slpProjection.cpp + model/softmaxLikelihood.cpp +) + +set(SAMPLE_NMT_DATA_SOURCES + data/benchmarkWriter.cpp + data/bleuScoreWriter.cpp + data/dataWriter.cpp + data/limitedSamplesDataReader.cpp + data/textReader.cpp + data/textWriter.cpp + data/vocabulary.cpp +) + +set(TARGET_NAME ${SAMPLE_NAME}) +target_sources(${TARGET_NAME} +PRIVATE + ${SAMPLE_NMT_MODEL_SOURCES} + ${SAMPLE_NMT_DATA_SOURCES} +) diff --git a/samples/opensource/sampleNMT/README.md b/samples/opensource/sampleNMT/README.md new file mode 100644 index 00000000..44990836 --- /dev/null +++ b/samples/opensource/sampleNMT/README.md @@ -0,0 +1,181 @@ +# Neural Machine Translation (NMT) Using A Sequence To Sequence (seq2seq) Model + + +**Table Of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) + * [Encoding and embedding](#encoding-and-embedding) + * [Attention mechanisms](#attention-mechanisms) + * [Beam search and projection](#beam-search-and-projection) + * [TensorRT API layers and ops](#tensorrt-api-layers-and-ops) +- [Prerequisites](#prerequisites) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, sampleNMT, demonstrates the implementation of Neural Machine Translation (NMT) based on a TensorFlow seq2seq model using the TensorRT API. The TensorFlow seq2seq model is an open sourced NMT project that uses deep neural networks to translate text from one language to another language. + +Specifically, this sample is an end-to-end sample that takes a TensorFlow model, builds an engine, and runs inference using the generated network. The sample is intended to be modular so it can be used as a starting point for your machine translation application. + +This sample implements German to English translation using the data that is provided by and trained from the [TensorFlow NMT (seq2seq) Tutorial](https://github.com/tensorflow/nmt.git). + +Note: Please note that the sample supports Linux only. Windows users can use Windows Subsystem for Linux (WSL) to run sampleNMT. + +## How does this sample work? + +The basic architecture of the NMT model consists of two sides: an encoder and a decoder. Incoming sentences are translated into sequences of words in a fixed vocabulary. The incoming sequence goes through the **encoder** and is transformed by a network of Recurrent Neural Network (RNN) layers into an internal state space that represents a language-independent "meaning" of the sentence. 
The **decoder** works the opposite way, transforming from the internal state space back into a sequence of words in the output vocabulary. + +### Encoding and embedding + +The encoding process requires a fixed vocabulary of words from the source language. Words not appearing in the vocabulary are replaced with an `UNKNOWN` token. Special symbols also represent `START-OF-SENTENCE` and `END-OF-SENTENCE`. After the input is finished, a `START-OF-SENTENCE` is fed in to mark the switch to decoding. The decoder will then produce the `END-OF-SENTENCE` symbol to indicate it is finished translating. + +Vocabulary words are represented as word vectors of a fixed size. The mapping from vocabulary word to embedding vector is learned during training. + +### Attention mechanisms + +Attention mechanisms sit between the encoder and decoder and allow the network to focus on one part of the translation task at a time. It is possible to directly connect the encoding and decoding stages but this would mean the internal state representing the meaning of the sentence would have to cover sentences of all possible lengths at once. + +This sample implements [Luong attention](https://arxiv.org/abs/1508.04025). In this model, at each decoder step the target hidden state is combined with all source states using the attention weights. A scoring function weighs each contribution from the source states. The attention vector is then fed into the next decoder stage as an input. + +### Beam search and projection + +There are several ways to organize the decode stage. The output of the RNN layer is not a single word. The simplest method is to choose the most likely word at each time step, assume that is the correct output, and continue until the decoder generates the `END-OF-SENTENCE` symbol. + +A better way to perform the decoding is to keep track of multiple candidate possibilities in parallel and keep updating the possibilities with the most likely sequences. In practice, a small fixed size of candidates works well. This method is called beam search. The beam width is the number of simultaneous candidate sequences that are in consideration at each time step. + +As part of beam search we need a mechanism to convert output states into probability vectors over the vocabulary. This is accomplished with the projection layer using a fixed dense matrix. + +For more information related to sampleNMT, see [Creating A Network Definition In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#network_c), [Working With Deep Learning Frameworks](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#build_model), and [Enabling FP16 Inference Using C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#enable_fp16_c). + +### TensorRT API layers and ops + +In this sample, the following layers are used. For more information about these layers, see the [TensorRT Developer Guide: Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#layers) documentation. + +[Constant layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#constant-layer) +The Constant layer outputs a tensor with values provided as parameters to this layer, enabling the convenient use of constants in computations. As used in the `slp_attention.cpp`, `slp_embedder.cpp` and `slp_projection.cpp` files. 
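For illustration, below is a minimal sketch of creating such a constant tensor with the TensorRT C++ API, roughly in the spirit of what the embedder does. The function and variable names (`buildEmbeddingTable`, `network`, `tableData`) are placeholders invented for this sketch, not identifiers from the sample; the real sample loads its trained weights from the checkpoint data in `slpEmbedder.cpp`.

```cpp
#include "NvInfer.h"
#include <vector>

// Minimal sketch, under the assumptions stated above: expose a
// [vocabSize x embedSize] embedding matrix to the network as a constant tensor.
nvinfer1::ITensor* buildEmbeddingTable(nvinfer1::INetworkDefinition& network,
                                       const std::vector<float>& tableData,
                                       int vocabSize, int embedSize)
{
    nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, tableData.data(),
                              static_cast<int64_t>(vocabSize) * embedSize};
    // The Constant layer simply emits this tensor; downstream layers can then
    // look up one row per input token index to obtain the word vectors.
    nvinfer1::IConstantLayer* constant
        = network.addConstant(nvinfer1::Dims2{vocabSize, embedSize}, weights);
    return constant->getOutput(0);
}
```

Note that TensorRT does not copy weight values when the layer is added, so the memory backing `tableData` must remain valid until the engine has been built.
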
+ +[Gather layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#gather-layer) +The Gather layer implements the `gather` operation on a given axis. As used in the `slp_embedder.cpp` file. + +[MatrixMultiply layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#matrixmultiply-layer) +The MatrixMultiply layer implements matrix multiplication for a collection of matrices. As used in the `context.cpp`, `multiplicative_alignment.cpp`, `slp_attention.cpp` and `slp_projection.cpp` files. + +[RaggedSoftMax layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#raggedsoftmax-layer) +The Ragged SoftMax layer applies the SoftMax function on an input tensor of sequences across the sequence lengths specified by the user. As used in the `context.cpp` file. + +[RNNv2 layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#rnnv2-layer) +The RNNv2 layer implements recurrent layers such as Recurrent Neural Network (RNN), Gated Recurrent Units (GRU), and Long Short-Term Memory (LSTM). It performs a recurrent operation, where the operation is defined by one of several well-known recurrent neural network (RNN) "cells". As used in the `lstm_encoder.cpp` and `lstm_decoder.cpp` files. + +[Shuffle layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#shuffle-layer) +The Shuffle layer implements a reshape and transpose operator for tensors. As used in the `lstm_encoder.cpp` and `lstm_decoder.cpp` files. + +[TopK layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#topk-layer) +The TopK layer finds the top K maximum (or minimum) elements along a dimension, returning a reduced tensor and a tensor of index positions. As used in the `softmax_likelihood.cpp` file. + +## Prerequisites + +The model was trained on the [German to English (De-En) dataset](https://github.com/tensorflow/nmt#wmt-german-english) in the WMT database. Before you can run the sample, you need trained model weights and the text and vocabulary data for performing inference. + +Run the following command from the ``. This will download the pre-trained weights, a vocabulary file and an example input text file. In addition, it will preprocess the input text file so that sampleNMT can translate it. The following command prepares all necessary input data. +`./samples/sampleNMT/get_newstest2015.sh` + +## Running the sample + +Now that you have trained weights, downloaded the text and vocabulary data, and compiled the sample you can run the sample. + +1. Compile this sample by running `make` in the `/samples/sampleNMT` directory. The binary named `sample_nmt` will be created in the `/bin` directory. + ``` + cd /samples/sampleNMT + make + ``` + + Where `` is where you installed TensorRT. + +2. Run the sample to generate the example translation from German to English: + ``` + sample_nmt --data_writer=text + ``` + + **Note:** If your data is not located in `/data/samples/nmt/deen`, use the `--data_dir=` option. Where `` is the path to your data directory. For example: + ``` + sample_nmt --data_dir= --data_writer=text + ``` + + The files in the `data` directory contain hardcoded names. Therefore, if you want to translate a different input file, rename the input file to `newstest2015.tok.bpe.32000.de` and put it in the data directory. + + The translated output is located in the `./translation_output.txt` file. + +3. 
Run the sample to get the BLEU score (the quality of the translated text) for the first 100 sentences: + ``` + sample_nmt --max_inference_samples=100 --data-writer=bleu + ``` + +4. Verify your translated output. + a. Compare your translated output to the `/data/newstest2015.tok.bpe.32000.en` translated output file in the TensorRT package. + b. Compare the quality of your translated output with the 25.85 BLEU score quality metric file in the TensorRT package. + + +### Sample `--help` options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. For example: +``` +data_dir: /workspace/tensorrt/samples/sampleNMT/data/deen +data_writer: text +Component Info: +– Data Reader: Text Reader, vocabulary size = 36548 +– Input Embedder: SLP Embedder, num inputs = 36548, num outputs = 512 +– Output Embedder: SLP Embedder, num inputs = 36548, num outputs = 512 +– Encoder: LSTM Encoder, num layers = 2, num units = 512 +– Decoder: LSTM Decoder, num layers = 2, num units = 512 +– Alignment: Multiplicative Alignment, source states size = 512, attention keys size = 512 +– Context: Ragged softmax + Batch GEMM +– Attention: SLP Attention, num inputs = 1024, num outputs = 512 +– Projection: SLP Projection, num inputs = 512, num outputs = 36548 +– Likelihood: Softmax Likelihood +– Search Policy: Beam Search Policy, beam = 5 +– Data Writer: Text Writer, vocabulary size = 36548 +End of Component Info +``` + +## Additional resources + +The following resources provide a deeper understanding about Neural Machine Translation and seq2seq models: + +**NMT** +- [Luong, Cho, Manning, (2016)](https://sites.google.com/site/acl16nmt/) +- [Luong, (2016)](https://github.com/lmthang/thesis) +- [Neubig, (2017)](https://arxiv.org/abs/1703.01619) + +**Models** +- [OpenNMT](http://opennmt.net/OpenNMT/) +- [NMT (seq2seq) Tutorial](https://github.com/tensorflow/nmt) + +**Blogs** +- [Neural Machine Translation Inference in TensorRT](https://devblogs.nvidia.com/neural-machine-translation-inference-tensorrt-4/) +- [Introduction to NMT](https://devblogs.nvidia.com/introduction-neural-machine-translation-with-gpus/) + +**Videos** +- [Optimizing NMT with TensorRT](http://on-demand.gputechconf.com/gtc/2018/video/S8822/) + +**Documentation** +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + +## Changelog + +June 2019 +This is the first release of the `README.md` file and sample. + +## Known issues + +If you would like to train your own weights through the TensorFlow implementation, you can use the `chptToBin.py` script to convert weights in a format that is readable by TensorRT. However, the `chptToBin.py` script may be outdated. diff --git a/samples/opensource/sampleNMT/chptToBin.py b/samples/opensource/sampleNMT/chptToBin.py new file mode 100644 index 00000000..fd4122c0 --- /dev/null +++ b/samples/opensource/sampleNMT/chptToBin.py @@ -0,0 +1,448 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import tensorflow as tf +import numpy as np +import os +import sys +import argparse +from copy import deepcopy + +""" + The conversion of a checkpoint from + https://github.com/tensorflow/nmt project + The conversion was tested using Tensorflow 1.6 +""" + +def chpt_to_dict_arrays_simple(file_name): + """ + Convert a checkpoint into into a dictionary of numpy arrays + for later use in TensorRT NMT sample. + """ + config = tf.ConfigProto(allow_soft_placement=True) + sess = tf.Session(config=config) + + saver = tf.train.import_meta_graph(file_name) + dir_name = os.path.dirname(os.path.abspath(file_name)) + saver.restore(sess, tf.train.latest_checkpoint(dir_name)) + + params = {} + print ('\nFound the following trainable variables:') + with sess.as_default(): + variables = tf.trainable_variables() + for v in variables: + params[v.name] = v.eval(session=sess) + print ("{0} {1}".format(v.name, params[v.name].shape)) + + #use default value + params["forget_bias"] = 1.0 + return params + +def chpt_to_dict_arrays(): + """ + Convert a checkpoint into a dictionary of numpy arrays + for later use in TensorRT NMT sample. + git clone https://github.com/tensorflow/nmt.git + """ + sys.path.append('./nmt') + from nmt.nmt import add_arguments, create_hparams + from nmt import attention_model + from nmt import model_helper + from nmt.nmt import create_or_load_hparams + from nmt import utils + from nmt import model as nmt_model + + nmt_parser = argparse.ArgumentParser() + add_arguments(nmt_parser) + FLAGS, unparsed = nmt_parser.parse_known_args() + + default_hparams = create_hparams(FLAGS) + + hparams = create_or_load_hparams(\ + FLAGS.out_dir, default_hparams, FLAGS.hparams_path, save_hparams=False) + + print (hparams) + + model_creator = None + if not hparams.attention: + model_creator = nmt_model.Model + elif hparams.attention_architecture == "standard": + model_creator = attention_model.AttentionModel + else: + raise ValueError("Unknown model architecture") + + infer_model = model_helper.create_infer_model(model_creator, hparams, scope = None) + + params = {} + print ('\nFound the following trainable variables:') + with tf.Session( + graph=infer_model.graph, config=utils.misc_utils.get_config_proto()) as sess: + + loaded_infer_model = model_helper.load_model( + infer_model.model, FLAGS.ckpt, sess, "infer") + + variables = tf.trainable_variables() + for v in variables: + params[v.name] = v.eval(session=sess) + print ("{0} {1}".format(v.name, params[v.name].shape)) + + params["forget_bias"] = hparams.forget_bias + return params + +def concatenate_layers(params): + """Concatenate weights from multiple layers""" + + input_dict_size = params[u'embeddings/encoder/embedding_encoder:0'].shape[0] + output_dict_size = params[u'embeddings/decoder/embedding_decoder:0'].shape[0] + print('Input dictionary size: {0}, Output dictionary size: {1}'.format(input_dict_size, output_dict_size)) + + layers = 0 + encoder_type = "unidirectional" + for key in params: + if 
"bidirectional_rnn" in key: + encoder_type = "bidirectional" + if "basic_lstm_cell" in key: + layers = layers + 1 + + # we have encoder, decoder, kernel and bias + layers = int(layers / 4) + print('Layers: {0}, Encoder type: {1}'.format(layers, encoder_type)) + + data = {} + encoder_postfix = u'/basic_lstm_cell/' + kernel_alias = u'kernel:0' + bias_alias = u'bias:0' + # weights, concatenate all layers + #process encoder + if encoder_type == 'bidirectional': + bi_layers = int(layers / 2) + if bi_layers == 1: + bifw_encoder_prefix = u'dynamic_seq2seq/encoder/bidirectional_rnn/fw/basic_lstm_cell/' + bibw_encoder_prefix = u'dynamic_seq2seq/encoder/bidirectional_rnn/bw/basic_lstm_cell/' + data["encrnnkernel"] = params[bifw_encoder_prefix + kernel_alias] + tmp_weights = params[bibw_encoder_prefix + kernel_alias] + data["encrnnkernel"] = np.concatenate((data["encrnnkernel"], tmp_weights), axis=0) + + data["encrnnbias"] = params[bifw_encoder_prefix + bias_alias] + tmp_weights = params[bibw_encoder_prefix + bias_alias] + data["encrnnbias"] = np.concatenate((data["encrnnbias"], tmp_weights), axis=0) + else: + bifw_encoder_prefix = u'dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_' + bibw_encoder_prefix = u'dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_' + + data["encrnnkernel"] = np.concatenate(tuple(params[bifw_encoder_prefix + str(i) \ + + encoder_postfix + kernel_alias] \ + for i in range(bi_layers)), axis=0) + tmp_weights = np.concatenate(tuple(params[bibw_encoder_prefix + str(i) \ + + encoder_postfix + kernel_alias] \ + for i in range(bi_layers)), axis=0) + data["encrnnkernel"] = np.concatenate((data["encrnnkernel"], tmp_weights), axis=0) + + data["encrnnbias"] = np.concatenate(tuple(params[bifw_encoder_prefix + str(i) \ + + encoder_postfix + bias_alias] \ + for i in range(bi_layers)), axis=0) + tmp_weights = np.concatenate(tuple(params[bibw_encoder_prefix + str(i) \ + + encoder_postfix + bias_alias] \ + for i in range(bi_layers)), axis=0) + data["encrnnbias"] = np.concatenate((data["encrnnbias"], tmp_weights), axis=0) + else: + uni_encoder_prefix = u'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_' + data["encrnnkernel"] = np.concatenate(tuple(params[uni_encoder_prefix + str(i) \ + + encoder_postfix + kernel_alias] \ + for i in range(layers)), axis=0) + data["encrnnbias"] = np.concatenate(tuple(params[uni_encoder_prefix + str(i) \ + + encoder_postfix + bias_alias] \ + for i in range(layers)), axis=0) + + data["encembed"] = params[u'embeddings/encoder/embedding_encoder:0'] + + #process decoder + data["decembed"] = params[u'embeddings/decoder/embedding_decoder:0'] + data["decmemkernel"] = params[u'dynamic_seq2seq/decoder/memory_layer/kernel:0'] + data["decattkernel"] = params[u'dynamic_seq2seq/decoder/attention/attention_layer/kernel:0'] + data["decprojkernel"] = params[u'dynamic_seq2seq/decoder/output_projection/kernel:0'] + + uni_decoder_prefix = u'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_' + data["decrnnkernel"] = np.concatenate(tuple(params[uni_decoder_prefix + str(i) \ + + encoder_postfix + kernel_alias] \ + for i in range(layers)), axis=0) + data["decrnnbias"] = np.concatenate(tuple(params[uni_decoder_prefix + str(i) \ + + encoder_postfix + bias_alias] \ + for i in range(layers)), axis=0) + + for key in data: + print("{0} shape: {1}".format(key, data[key].shape)) + + num_units = int(data["decrnnkernel"].shape[1] / 4) + encoder_type_int = 1 if encoder_type == 'bidirectional' else 0 + dimensions = {"layers": layers, + "encoder_type": 
encoder_type_int, + "num_units": num_units, + "encembed_outputs": data['encembed'].shape[0], + "decembed_outputs": data['decembed'].shape[0], + } + return dimensions, data + +def convert_rnn_kernel(weights, dimensions, is_decoder_rnn = False): + """ + In place. weights conversion + TensorFlow weight parameters for BasicLSTMCell + are formatted as: + Each [WR][icfo] is hiddenSize sequential elements. + CellN Row 0: WiT, WcT, WfT, WoT + CellN Row 1: WiT, WcT, WfT, WoT + ... + CellN RowM-1: WiT, WcT, WfT, WoT + CellN RowM+0: RiT, RcT, RfT, RoT + CellN RowM+1: RiT, RcT, RfT, RoT + ... + CellNRow(M+P)-1: RiT, RcT, RfT, RoT + M - data size + P - projection size + TensorRT expects the format to laid out in memory: + CellN: Wf, Wi, Wc, Wo, Rf, Ri, Rc, Ro + + For the purpose of implementing LSTMP all W and R weights become weights from W + CellN: Wf, Rf, Wi, Ri, Wc, Rc, Wo, Ro, Empty states + + Update: alternative notation + Tensorflow documents gates' order in e.g. + https:github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/rnn_cell_impl.py:439 + TF: i = input_gate, j = new_input (cell gate), f = forget_gate, o = output_gate - ijfo + Need to convert 'ijfo' to 'fijo' + """ + + print("Starting shape: {0}".format(weights.shape)) + + num_units = dimensions["num_units"] + layers = dimensions["layers"] + + + new_weights = np.empty([0], dtype=np.float32) + # if is_decoder_rnn == False: + if False : + # we can use decoder path for both, but we leave it for now + input_size = num_units + # case encoder + # (layers * 2 * input_size, 4 * num_units) -> (layers, 2, input_size, 4, num_units)) + weights = np.reshape(weights, (layers, 2, input_size, 4, num_units)) + print("After reshape: {0}".format(weights.shape)) + + # reorder/transpose axis to match TensorRT format (layers, 2, 4, num_units, input_size) + weights = np.moveaxis(weights, [2, 3, 4], [4, 2, 3]) + print("After moveaxis: {0}".format(weights.shape)) + + # then we reorder gates from Tensorflow's 'icfo' into TensorRT's 'fico' order + input_perm = [ 1, 2, 0, 3 ] + temp_weights = np.empty([layers, 2, 4, num_units, input_size], dtype=np.float32) + for i in range(4): + temp_weights[:, :, input_perm[i], :, :] = weights[:, :, i, :, :] + + weights = deepcopy(temp_weights) + else: + offset = 0 + for i in range(layers): + # first layer has shape (input_size + num_units, 4 * num_units) + # other layers (num_units + num_units, 4 * num_units) + input_size = 2 * num_units if i == 0 and is_decoder_rnn else num_units + temp_weights_w = np.empty([4, num_units, input_size], dtype=np.float32) + temp_weights_r = np.empty([4, num_units, num_units], dtype=np.float32) + + layer_weights_w = np.reshape(weights[offset:(offset + input_size), :], (input_size, 4, num_units)) + layer_weights_r = np.reshape(weights[(offset + input_size):(offset + input_size + num_units), :], (num_units, 4, num_units)) + + # reorder/transpose axis to match TensorRT format (layers, 2, 4, num_units, input_size) + layer_weights_w = np.moveaxis(layer_weights_w, [0, 1, 2], [2, 0, 1]) + layer_weights_r = np.moveaxis(layer_weights_r, [0, 1, 2], [2, 0, 1]) + + # then we reorder gates from Tensorflow's 'icfo' into TensorRT's 'fico' order + input_perm = [ 1, 2, 0, 3 ] + for i in range(4): + temp_weights_w[input_perm[i], :, :] = layer_weights_w[i, :, :] + temp_weights_r[input_perm[i], :, :] = layer_weights_r[i, :, :] + + layer_weights_w = deepcopy(temp_weights_w.flatten()) + layer_weights_r = deepcopy(temp_weights_r.flatten()) + new_weights = np.concatenate((new_weights, layer_weights_w, 
layer_weights_r), axis = 0) + + offset = offset + input_size + num_units + + return new_weights + +def convert_rnn_bias(weights, dimensions, forget_bias = 1.0): + """ + TensorFlow bias parameters for BasicLSTMCell + are formatted as: + CellN: Bi, Bc, Bf, Bo + + TensorRT expects the format to be: + CellN: Wf, Wi, Wc, Wo, Rf, Ri, Rc, Ro + + Since Tensorflow already combines U and W, + we double the size and set all of U to zero. + """ + num_units = dimensions["num_units"] + layers = dimensions["layers"] + temp_weights = np.zeros([layers, 2 * 4, num_units], dtype=np.float32) + weights = np.reshape(weights, (layers, 4, num_units)) + + # then we reorder gates from Tensorflow's 'icfo' into TensorRT's 'fico' order + input_perm = [ 1, 2, 0, 3 ] + for i in range(4): + temp_weights[:, input_perm[i], :] = weights[:, i, :] + # Add a value to f bias to be consistent with the Tensorflow model. + print("Adding {0} to forget bias".format(forget_bias)) + temp_weights[:, 0, :] = np.add(temp_weights[:, 0, :], forget_bias) + weights = deepcopy(temp_weights) + + return weights + + +def convert_weigts(dimensions, data, forget_bias = 1.0): + """Convert weights from Tensorflow to TensorRT format""" + + print("Processing encoder RNN kernel") + data["encrnnkernel"] = convert_rnn_kernel(data["encrnnkernel"], dimensions, False) + + print("Processing encoder RNN bias") + data["encrnnbias"] = convert_rnn_bias(data["encrnnbias"], dimensions, forget_bias = forget_bias) + + print("Processing decoder RNN kernel") + data["decrnnkernel"] = convert_rnn_kernel(data["decrnnkernel"], dimensions, True) + + print("Processing decoder RNN bias") + data["decrnnbias"] = convert_rnn_bias(data["decrnnbias"], dimensions, forget_bias = forget_bias) + + return data + +def save_layer_weights(data, list_keys, dims, footer_string, file_name): + """ + data - dictionary with string names as keys and + numpy weights as values + list_keys - list of dictionary keys to save + dims - list of int values relevant to the layer + e.g. tensor dimensions sufficient to extract all the tensors + footer_string - marker placed at the end of file + + file format: data -> meta_data -> footer + """ + + data_type = data[list_keys[0]].dtype + #default precision is FP32 + # The values should be compartible with DataType from Nvinfer.h + data_prec = 1 if data_type == np.dtype('float16') else 0 + + meta_data = np.int32([data_prec] + dims) + meta_count = np.int32(meta_data.shape[0]) + + out_file = open(file_name, 'wb') + for key in list_keys: + out_file.write(data[key].tobytes()) + out_file.write(meta_data.tobytes()) + # write footer + out_file.write(meta_count.tobytes() + bytearray(footer_string, 'ASCII')) + +def main(_): + + if len(sys.argv) < 3: + print ('\nUsage:') + print ('python {0} --weightsdir='.format(sys.argv[0])) + print ("""e.g. 
\npython {0} --src=en --tgt=vi \\ + --ckpt=/path/to/envi_model/translate.ckpt \\ + --hparams_path=nmt/standard_hparams/iwslt15.json \\ + --out_dir=/tmp/envi \\ + --vocab_prefix=/tmp/nmt_data/vocab \\ + --inference_input_file=/tmp/nmt_data/tst2013.en \\ + --inference_output_file=/tmp/envi/output_infer \\ + --inference_ref_file=/tmp/nmt_data/tst2013.vi \\ + --weightsdir=envi""".format(sys.argv[0])) + print ('\nOR\n') + print ('python {0} --metafile= --weightsdir= '.format(sys.argv[0])) + print ('e.g.\npython {0} --metafile=./translate.ckpt-12000.meta --weightsdir=envi'.format(sys.argv[0])) + sys.exit() + + nmt_parser = argparse.ArgumentParser() + nmt_parser.add_argument("--metafile", type=str, default=None, + help="Path to the metafile (alternative checkpoint restore, may not work)") + nmt_parser.add_argument("--weightsdir", type=str, default="weights", + help="Output weights directory") + trt_flags, unparsed = nmt_parser.parse_known_args() + + print('\nLoading the checkpoint...\n') + + if trt_flags.metafile is None: + # No metafile given: rebuild the NMT graph via the nmt package and load the latest checkpoint. + params = chpt_to_dict_arrays() + else: + # Restore directly from the given metafile. + params = chpt_to_dict_arrays_simple(trt_flags.metafile) + + print('\nConcatenating the weights...') + dimensions, data = concatenate_layers(params) + + print('\nConverting the weights...') + # Convert weights to TensorRT format + data = convert_weigts(dimensions, data, params["forget_bias"]) + + print('\nSaving into binary file...') + + case_dir = trt_flags.weightsdir + if not os.path.isdir(case_dir): + os.mkdir(case_dir) + case_dir = case_dir + "/" + + trt_string = u'trtsamplenmt' + # save embed weights + save_layer_weights(data, ["encembed"], \ + [ dimensions["encembed_outputs"], \ + dimensions["num_units"] ], \ + trt_string, case_dir + "encembed.bin") + save_layer_weights(data, ["decembed"], \ + [ dimensions["decembed_outputs"], \ + dimensions["num_units"] ], \ + trt_string, case_dir + "decembed.bin") + #encrnn + save_layer_weights(data, ["encrnnkernel", "encrnnbias"], \ + [ dimensions["encoder_type"], \ + dimensions["layers"], \ + dimensions["num_units"] ], \ + trt_string, case_dir + "encrnn.bin") + #decrnn + save_layer_weights(data, ["decrnnkernel", "decrnnbias"], \ + [ 0, \ + dimensions["layers"], \ + dimensions["num_units"] ], \ + trt_string, case_dir + "decrnn.bin") + #decprojkernel + save_layer_weights(data, ["decprojkernel"], \ + [ dimensions["num_units"], \ + dimensions["decembed_outputs"] ], \ + trt_string, case_dir + "decproj.bin") + + #decmemkernel + save_layer_weights(data, ["decmemkernel"], \ + [ dimensions["num_units"], \ + dimensions["num_units"] ], \ + trt_string, case_dir + "decmem.bin") + + #decattkernel + # first dimension is 3 * num_units of bi RNN, 2 * num_units otherwise + save_layer_weights(data, ["decattkernel"], \ + [ data["decattkernel"].shape[0], \ + dimensions["num_units"] ], \ + trt_string, case_dir + "decatt.bin") + + +if __name__ == "__main__": + tf.app.run() diff --git a/samples/opensource/sampleNMT/component.h b/samples/opensource/sampleNMT/component.h new file mode 100644 index 00000000..9222a538 --- /dev/null +++ b/samples/opensource/sampleNMT/component.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_COMPONENT_ +#define SAMPLE_NMT_COMPONENT_ + +#include +#include + +namespace nmtSample +{ +/** \class Component + * + * \brief a functional part of the sample + * + */ +class Component +{ +public: + typedef std::shared_ptr ptr; + + /** + * \brief get the textual description of the component + */ + virtual std::string getInfo() = 0; + +protected: + Component() = default; + + virtual ~Component() = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_COMPONENT_ diff --git a/samples/opensource/sampleNMT/cudaError.h b/samples/opensource/sampleNMT/cudaError.h new file mode 100644 index 00000000..99aee487 --- /dev/null +++ b/samples/opensource/sampleNMT/cudaError.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SAMPLE_NMT_CUDA_ERROR_ +#define SAMPLE_NMT_CUDA_ERROR_ + +#include +#include +#include + +#define CUDA_CHECK(callstr) \ + { \ + cudaError_t error_code = callstr; \ + if (error_code != cudaSuccess) \ + { \ + std::cerr << "CUDA error " << error_code << ": \"" << cudaGetErrorString(error_code) << "\" at " \ + << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } \ + } + +#endif // SAMPLE_NMT_CUDA_ERROR_ diff --git a/samples/opensource/sampleNMT/data/benchmarkWriter.cpp b/samples/opensource/sampleNMT/data/benchmarkWriter.cpp new file mode 100644 index 00000000..463324e4 --- /dev/null +++ b/samples/opensource/sampleNMT/data/benchmarkWriter.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmarkWriter.h" +#include "logger.h" + +#include + +namespace nmtSample +{ +BenchmarkWriter::BenchmarkWriter() + : mSampleCount(0) + , mInputTokenCount(0) + , mOutputTokenCount(0) + , mStartTS(std::chrono::high_resolution_clock::now()) +{ +} + +void BenchmarkWriter::write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) +{ + ++mSampleCount; + mInputTokenCount += actualInputSequenceLength; + mOutputTokenCount += actualOutputSequenceLength; +} + +void BenchmarkWriter::initialize() +{ + mStartTS = std::chrono::high_resolution_clock::now(); +} + +void BenchmarkWriter::finalize() +{ + std::chrono::duration sec = std::chrono::high_resolution_clock::now() - mStartTS; + int totalTokenCount = mInputTokenCount + mOutputTokenCount; + gLogInfo << mSampleCount << " sequences generated in " << sec.count() << " seconds, " + << (mSampleCount / sec.count()) << " samples/sec" << std::endl; + gLogInfo << totalTokenCount << " tokens processed (source and destination), " << (totalTokenCount / sec.count()) + << " tokens/sec" << std::endl; +} + +std::string BenchmarkWriter::getInfo() +{ + return "Benchmark Writer"; +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/data/benchmarkWriter.h b/samples/opensource/sampleNMT/data/benchmarkWriter.h new file mode 100644 index 00000000..baa01d91 --- /dev/null +++ b/samples/opensource/sampleNMT/data/benchmarkWriter.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SAMPLE_NMT_BENCHMARK_WRITER_ +#define SAMPLE_NMT_BENCHMARK_WRITER_ + +#include +#include + +#include "dataWriter.h" + +namespace nmtSample +{ +/** \class BenchmarkWriter + * + * \brief all it does is to measure the performance of sequence generation + * + */ +class BenchmarkWriter : public DataWriter +{ +public: + BenchmarkWriter(); + + void write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) override; + + void initialize() override; + + void finalize() override; + + std::string getInfo() override; + + ~BenchmarkWriter() override = default; + +private: + int mSampleCount; + int mInputTokenCount; + int mOutputTokenCount; + std::chrono::high_resolution_clock::time_point mStartTS; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_BENCHMARK_WRITER_ diff --git a/samples/opensource/sampleNMT/data/bleuScoreWriter.cpp b/samples/opensource/sampleNMT/data/bleuScoreWriter.cpp new file mode 100644 index 00000000..da816725 --- /dev/null +++ b/samples/opensource/sampleNMT/data/bleuScoreWriter.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bleuScoreWriter.h" +#include "logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nmtSample +{ + +typedef std::vector Segment_t; +typedef std::map Count_t; +int read(std::vector& samples, std::shared_ptr input, int samplesToRead = 1) +{ + std::string line; + int lineCounter = 0; + Segment_t tokens; + samples.resize(0); + std::string pattern("@@ "); + while (lineCounter < samplesToRead && std::getline(*input, line)) + { + // if clean and handle BPE or SPM outputs is required + std::size_t p0 = 0; + while ((p0 = line.find(pattern, p0)) != std::string::npos) + { + line.replace(p0, pattern.length(), ""); + } + + // generate error if those special characters exist. Windows needs explicit encoding. +#ifdef _MSC_VER + p0 = line.find(u8"\u2581"); +#else + p0 = line.find("\u2581"); +#endif + assert((p0 == std::string::npos)); + std::istringstream ss(line); + std::string token; + tokens.resize(0); + while (ss >> token) + { + tokens.emplace_back(token); + } + samples.emplace_back(tokens); + lineCounter++; + } + return lineCounter; +} + +Count_t ngramCounts(const Segment_t& segment, int maxOrder = 4) +{ + Count_t ngramCounts; + + for (int order = 1; order < maxOrder + 1; order++) + { + for (int i = 0; i < static_cast(segment.size()) - order + 1; i++) + { + Segment_t ngram; + for (int j = i; j < i + order; j++) + ngram.emplace_back(segment[j]); + + auto it = ngramCounts.find(ngram); + if (it != ngramCounts.end()) + { + it->second++; + } + else + ngramCounts[ngram] = 1; + } + } + + return ngramCounts; +} + +Count_t ngramCountIntersection(const Count_t& cnt0, const Count_t& cnt1) +{ + Count_t overlap; + // merge the maps + auto it0 = cnt0.begin(), it1 = cnt1.begin(), end0 = cnt0.end(), end1 = cnt1.end(); + while (it0 != end0 && it1 != end1) + { + if (it0->first == it1->first) + { + overlap.emplace(it0->first, std::min(it0->second, it1->second)); + it0++; + it1++; + } + else + { + if (it0->first < it1->first) + it0++; + else + it1++; + } + } + return overlap; +} + +void accumulateBLEU(const std::vector& referenceSamples, const std::vector& outputSamples, + int maxOrder, size_t& referenceLength, size_t& translationLength, std::vector& matchesByOrder, + std::vector& possibleMatchesByOrder) +{ + assert(referenceSamples.size() == outputSamples.size()); + auto reference = referenceSamples.begin(); + auto translation = outputSamples.begin(); + + while (translation != outputSamples.end()) + { + referenceLength += reference->size(); + translationLength += translation->size(); + + Count_t refNgramCounts = ngramCounts(*reference); + Count_t outputNgramCounts = ngramCounts(*translation); + Count_t overlap = ngramCountIntersection(outputNgramCounts, refNgramCounts); + for (auto& ngram : overlap) + { + matchesByOrder[ngram.first.size() - 1] += ngram.second; + } + for (int order = 1; order < maxOrder + 1; order++) + { + int possibleMatches = static_cast(translation->size()) - order + 1; + if (possibleMatches > 0) + possibleMatchesByOrder[order - 1] += possibleMatches; + } + ++translation; + ++reference; + } +} + 
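+// ---------------------------------------------------------------------------
+// Illustrative note (not part of the original sample): the counters filled in
+// by accumulateBLEU() above feed the corpus-level BLEU computed in
+// BLEUScoreWriter::getScore() below. As a hand-worked example with maxOrder = 2,
+// a reference "a b c" and a translation "a b d" yield
+// matchesByOrder = {2, 1} (unigrams "a", "b"; bigram "a b") and
+// possibleMatchesByOrder = {3, 2}, hence precisions = {2/3, 1/2},
+// geometric mean = sqrt(1/3) ~= 0.577, brevity penalty = 1 (equal lengths),
+// giving a BLEU score of roughly 57.7.
+// ---------------------------------------------------------------------------
+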
+BLEUScoreWriter::BLEUScoreWriter( + std::shared_ptr referenceTextInput, Vocabulary::ptr vocabulary, int maxOrder) + : mReferenceInput(referenceTextInput) + , mVocabulary(vocabulary) + , mReferenceLength(0) + , mTranslationLength(0) + , mMaxOrder(maxOrder) + , mSmooth(false) + , mMatchesByOrder(maxOrder, 0) + , mPossibleMatchesByOrder(maxOrder, 0) +{ +} + +void BLEUScoreWriter::write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) +{ + std::vector outputSamples; + std::vector referenceSamples; + int numReferenceSamples = read(referenceSamples, mReferenceInput, 1); + assert(numReferenceSamples == 1); + + Segment_t segment; + std::stringstream filteredSentence(DataWriter::generateText(actualOutputSequenceLength, hOutputData, mVocabulary)); + std::string token; + while (filteredSentence >> token) + { + segment.emplace_back(token); + } + outputSamples.emplace_back(segment); + + accumulateBLEU(referenceSamples, outputSamples, mMaxOrder, mReferenceLength, mTranslationLength, mMatchesByOrder, + mPossibleMatchesByOrder); +} + +void BLEUScoreWriter::initialize() {} + +void BLEUScoreWriter::finalize() +{ + gLogInfo << "BLEU score = " << getScore() << std::endl; +} + +float BLEUScoreWriter::getScore() const +{ + std::vector precisions(mMaxOrder, 0.0); + for (int i = 0; i < mMaxOrder; i++) + { + if (mSmooth) + { + precisions[i] = ((mMatchesByOrder[i] + 1.) / (mPossibleMatchesByOrder[i] + 1.)); + } + else + { + if (mPossibleMatchesByOrder[i] > 0) + precisions[i] = (static_cast(mMatchesByOrder[i]) / mPossibleMatchesByOrder[i]); + else + precisions[i] = 0.0; + } + } + double pLogSum, geoMean; + if (*std::min_element(precisions.begin(), precisions.end()) > 0.0) + { + pLogSum = 0.0; + for (auto p : precisions) + pLogSum += (1. / mMaxOrder) * log(p); + geoMean = exp(pLogSum); + } + else + geoMean = 0.0; + + double ratio = static_cast(mTranslationLength) / mReferenceLength; + double bp; + bp = (ratio > 1.0) ? 1.0 : exp(1.0 - 1.0 / ratio); + return static_cast(geoMean * bp * 100.0); +} + +std::string BLEUScoreWriter::getInfo() +{ + std::stringstream ss; + ss << "BLEU Score Writer, max order = " << mMaxOrder; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/data/bleuScoreWriter.h b/samples/opensource/sampleNMT/data/bleuScoreWriter.h new file mode 100644 index 00000000..954d2107 --- /dev/null +++ b/samples/opensource/sampleNMT/data/bleuScoreWriter.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SAMPLE_NMT_BLEU_SCORE_WRITER_ +#define SAMPLE_NMT_BLEU_SCORE_WRITER_ + +#include +#include +#include + +#include "dataWriter.h" +#include "vocabulary.h" + +namespace nmtSample +{ +/** \class BLEUScoreWriter + * + * \brief computes the BLEU score of the generated text against reference translations + * + */ +class BLEUScoreWriter : public DataWriter +{ +public: + BLEUScoreWriter(std::shared_ptr referenceTextInput, Vocabulary::ptr vocabulary, int maxOrder = 4); + + void write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) override; + + void initialize() override; + + void finalize() override; + + std::string getInfo() override; + + float getScore() const; + + ~BLEUScoreWriter() override = default; + +private: + std::shared_ptr mReferenceInput; + Vocabulary::ptr mVocabulary; + size_t mReferenceLength; + size_t mTranslationLength; + int mMaxOrder; + bool mSmooth; + std::vector mMatchesByOrder; + std::vector mPossibleMatchesByOrder; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_BLEU_SCORE_WRITER_ diff --git a/samples/opensource/sampleNMT/data/dataReader.h b/samples/opensource/sampleNMT/data/dataReader.h new file mode 100644 index 00000000..d5293d02 --- /dev/null +++ b/samples/opensource/sampleNMT/data/dataReader.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_DATA_READER_ +#define SAMPLE_NMT_DATA_READER_ + +#include + +#include "../component.h" + +namespace nmtSample +{ +/** \class DataReader + * + * \brief reader of sequences of data + * + */ +class DataReader : public Component +{ +public: + typedef std::shared_ptr ptr; + + DataReader() = default; + + /** + * \brief reads a batch of samples/sequences + * + * \return the actual number of samples read + */ + virtual int read(int samplesToRead, int maxInputSequenceLength, int* hInputData, int* hActualInputSequenceLengths) + = 0; + + /** + * \brief Reset the reader position; the data reader is ready to read the data from the beginning again after the + * function returns + */ + virtual void reset() = 0; + + ~DataReader() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_DATA_READER_ diff --git a/samples/opensource/sampleNMT/data/dataWriter.cpp b/samples/opensource/sampleNMT/data/dataWriter.cpp new file mode 100644 index 00000000..0a74e2a4 --- /dev/null +++ b/samples/opensource/sampleNMT/data/dataWriter.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "dataWriter.h" + +namespace nmtSample +{ +std::string DataWriter::generateText(int sequenceLength, const int* currentOutputData, Vocabulary::ptr vocabulary) +{ + // if clean and handle BPE outputs is required + std::string delimiter = "@@"; + size_t delimiterSize = delimiter.size(); + std::stringstream sentence; + std::string word(""); + const char* wordDelimiter = ""; + for (int i = 0; i < sequenceLength; ++i) + { + int id = currentOutputData[i]; + if (id != vocabulary->getEndSequenceId()) + { + std::string token = vocabulary->getToken(id); + if ((token.size() >= delimiterSize) + && (token.compare(token.size() - delimiterSize, delimiterSize, delimiter) == 0)) + { + word = word + token.erase(token.size() - delimiterSize, delimiterSize); + } + else + { + word = word + token; + sentence << wordDelimiter; + sentence << word; + word = ""; + wordDelimiter = " "; + } + } + } + return sentence.str(); +} +} // namespace nmtSample \ No newline at end of file diff --git a/samples/opensource/sampleNMT/data/dataWriter.h b/samples/opensource/sampleNMT/data/dataWriter.h new file mode 100644 index 00000000..09bae426 --- /dev/null +++ b/samples/opensource/sampleNMT/data/dataWriter.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SAMPLE_NMT_DATA_WRITER_ +#define SAMPLE_NMT_DATA_WRITER_ + +#include +#include + +#include "../component.h" +#include "vocabulary.h" + +namespace nmtSample +{ +/** \class DataWriter + * + * \brief writer of sequences of data + * + */ +class DataWriter : public Component +{ +public: + typedef std::shared_ptr ptr; + + DataWriter() = default; + + /** + * \brief write the generated sequence + */ + virtual void write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) = 0; + + /** + * \brief it is called right before inference starts + */ + virtual void initialize() = 0; + + /** + * \brief it is called right after inference ends + */ + virtual void finalize() = 0; + + ~DataWriter() override = default; + +protected: + static std::string generateText(int sequenceLength, const int* currentOutputData, Vocabulary::ptr vocabulary); +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_DATA_WRITER_ diff --git a/samples/opensource/sampleNMT/data/limitedSamplesDataReader.cpp b/samples/opensource/sampleNMT/data/limitedSamplesDataReader.cpp new file mode 100644 index 00000000..d0fd58f1 --- /dev/null +++ b/samples/opensource/sampleNMT/data/limitedSamplesDataReader.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "limitedSamplesDataReader.h" + +#include +#include + +namespace nmtSample +{ +LimitedSamplesDataReader::LimitedSamplesDataReader(int maxSamplesToRead, DataReader::ptr originalDataReader) + : gMaxSamplesToRead(maxSamplesToRead) + , gOriginalDataReader(originalDataReader) + , gCurrentPosition(0) +{ +} + +int LimitedSamplesDataReader::read( + int samplesToRead, int maxInputSequenceLength, int* hInputData, int* hActualInputSequenceLengths) +{ + int limitedSamplesToRead = std::min(samplesToRead, std::max(gMaxSamplesToRead - gCurrentPosition, 0)); + int samplesRead = gOriginalDataReader->read( + limitedSamplesToRead, maxInputSequenceLength, hInputData, hActualInputSequenceLengths); + gCurrentPosition += samplesRead; + return samplesRead; +} + +void LimitedSamplesDataReader::reset() +{ + gOriginalDataReader->reset(); + gCurrentPosition = 0; +} + +std::string LimitedSamplesDataReader::getInfo() +{ + std::stringstream ss; + ss << "Limited Samples Reader, max samples = " << gMaxSamplesToRead + << ", original reader info: " << gOriginalDataReader->getInfo(); + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/data/limitedSamplesDataReader.h b/samples/opensource/sampleNMT/data/limitedSamplesDataReader.h new file mode 100644 index 00000000..75a1fdba --- /dev/null +++ b/samples/opensource/sampleNMT/data/limitedSamplesDataReader.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SAMPLE_NMT_LIMITED_SAMPLES_DATA_READER_ +#define SAMPLE_NMT_LIMITED_SAMPLES_DATA_READER_ + +#include "dataReader.h" + +namespace nmtSample +{ +/** \class LimitedSamplesDataReader + * + * \brief wraps another data reader and limits the number of samples to read + * + */ +class LimitedSamplesDataReader : public DataReader +{ +public: + LimitedSamplesDataReader(int maxSamplesToRead, DataReader::ptr originalDataReader); + + int read(int samplesToRead, int maxInputSequenceLength, int* hInputData, int* hActualInputSequenceLengths) override; + + void reset() override; + + std::string getInfo() override; + +private: + int gMaxSamplesToRead; + DataReader::ptr gOriginalDataReader; + int gCurrentPosition; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_LIMITED_SAMPLES_DATA_READER_ diff --git a/samples/opensource/sampleNMT/data/sequenceProperties.h b/samples/opensource/sampleNMT/data/sequenceProperties.h new file mode 100644 index 00000000..5c73f639 --- /dev/null +++ b/samples/opensource/sampleNMT/data/sequenceProperties.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_SEQUENCE_PROPERTIES_ +#define SAMPLE_NMT_SEQUENCE_PROPERTIES_ + +#include + +namespace nmtSample +{ +/** \class SequenceProperties + * + * \brief provides encoder/decoder relevant properties of sequences + * + */ +class SequenceProperties +{ +public: + typedef std::shared_ptr ptr; + + SequenceProperties() = default; + + virtual int getStartSequenceId() = 0; + + virtual int getEndSequenceId() = 0; + + virtual ~SequenceProperties() = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_SEQUENCE_PROPERTIES_ diff --git a/samples/opensource/sampleNMT/data/textReader.cpp b/samples/opensource/sampleNMT/data/textReader.cpp new file mode 100644 index 00000000..8e0c539d --- /dev/null +++ b/samples/opensource/sampleNMT/data/textReader.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "textReader.h" + +#include +#include +#include +#include + +namespace nmtSample +{ +TextReader::TextReader(std::shared_ptr textInput, Vocabulary::ptr vocabulary) + : mInput(textInput) + , mVocabulary(vocabulary) +{ +} + +int TextReader::read(int samplesToRead, int maxInputSequenceLength, int* hInputData, int* hActualInputSequenceLengths) +{ + std::setlocale(LC_ALL, "en_US.UTF-8"); + std::string line; + + int lineCounter = 0; + while (lineCounter < samplesToRead && std::getline(*mInput, line)) + { + std::istringstream ss(line); + std::string token; + int tokenCounter = 0; + while ((ss >> token) && (tokenCounter < maxInputSequenceLength)) + { + hInputData[maxInputSequenceLength * lineCounter + tokenCounter] = mVocabulary->getId(token); + tokenCounter++; + } + + hActualInputSequenceLengths[lineCounter] = tokenCounter; + + // Fill unused values with valid vocabulary ID, it doesn't necessary have to be eos + std::fill(hInputData + maxInputSequenceLength * lineCounter + tokenCounter, + hInputData + maxInputSequenceLength * (lineCounter + 1), mVocabulary->getEndSequenceId()); + + lineCounter++; + } + return lineCounter; +} + +void TextReader::reset() +{ + mInput->seekg(0, mInput->beg); +} + +std::string TextReader::getInfo() +{ + std::stringstream ss; + ss << "Text Reader, vocabulary size = " << mVocabulary->getSize(); + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/data/textReader.h b/samples/opensource/sampleNMT/data/textReader.h new file mode 100644 index 00000000..15e28f88 --- /dev/null +++ b/samples/opensource/sampleNMT/data/textReader.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SAMPLE_NMT_TEXT_READER_ +#define SAMPLE_NMT_TEXT_READER_ + +#include "dataReader.h" +#include "vocabulary.h" +#include +#include +#include + +namespace nmtSample +{ +/** \class TextReader + * + * \brief reads sequences of data from input stream + * + */ +class TextReader : public DataReader +{ +public: + TextReader(std::shared_ptr textInput, Vocabulary::ptr vocabulary); + + int read(int samplesToRead, int maxInputSequenceLength, int* hInputData, int* hActualInputSequenceLengths) override; + + void reset() override; + + std::string getInfo() override; + +private: + std::shared_ptr mInput; + Vocabulary::ptr mVocabulary; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_TEXT_READER_ diff --git a/samples/opensource/sampleNMT/data/textWriter.cpp b/samples/opensource/sampleNMT/data/textWriter.cpp new file mode 100644 index 00000000..ed2a556b --- /dev/null +++ b/samples/opensource/sampleNMT/data/textWriter.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "textWriter.h" + +#include +#include + +namespace nmtSample +{ +TextWriter::TextWriter(std::shared_ptr textOutput, Vocabulary::ptr vocabulary) + : mOutput(textOutput) + , mVocabulary(vocabulary) +{ +} + +void TextWriter::write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) +{ + // if clean and handle BPE outputs is required + *mOutput << DataWriter::generateText(actualOutputSequenceLength, hOutputData, mVocabulary) << "\n"; +} + +void TextWriter::initialize() {} + +void TextWriter::finalize() {} + +std::string TextWriter::getInfo() +{ + std::stringstream ss; + ss << "Text Writer, vocabulary size = " << mVocabulary->getSize(); + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/data/textWriter.h b/samples/opensource/sampleNMT/data/textWriter.h new file mode 100644 index 00000000..645defe7 --- /dev/null +++ b/samples/opensource/sampleNMT/data/textWriter.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_TEXT_WRITER_ +#define SAMPLE_NMT_TEXT_WRITER_ + +#include +#include + +#include "dataWriter.h" +#include "vocabulary.h" + +namespace nmtSample +{ +/** \class TextWriter + * + * \brief writes sequences of data into an output stream + * + */ +class TextWriter : public DataWriter +{ +public: + TextWriter(std::shared_ptr textOutput, Vocabulary::ptr vocabulary); + + void write(const int* hOutputData, int actualOutputSequenceLength, int actualInputSequenceLength) override; + + void initialize() override; + + void finalize() override; + + std::string getInfo() override; + + ~TextWriter() override = default; + +private: + std::shared_ptr mOutput; + Vocabulary::ptr mVocabulary; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_TEXT_WRITER_ diff --git a/samples/opensource/sampleNMT/data/vocabulary.cpp b/samples/opensource/sampleNMT/data/vocabulary.cpp new file mode 100644 index 00000000..ebaab836 --- /dev/null +++ b/samples/opensource/sampleNMT/data/vocabulary.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vocabulary.h" +#include +#include +#include +#include + +namespace nmtSample +{ +const std::string Vocabulary::mSosStr = ""; +const std::string Vocabulary::mEosStr = ""; +const std::string Vocabulary::mUnkStr = ""; + +Vocabulary::Vocabulary() + : mNumTokens(0) +{ +} + +void Vocabulary::add(const std::string& token) +{ + assert(mTokenToId.find(token) == mTokenToId.end()); + mTokenToId[token] = mNumTokens; + mIdToToken.push_back(token); + mNumTokens++; +} + +int Vocabulary::getId(const std::string& token) const +{ + auto it = mTokenToId.find(token); + if (it != mTokenToId.end()) + return it->second; + return mUnkId; +} + +std::string Vocabulary::getToken(int id) const +{ + assert(id < mNumTokens); + return mIdToToken[id]; +} + +int Vocabulary::getSize() const +{ + return mNumTokens; +} + +std::istream& operator>>(std::istream& input, Vocabulary& value) +{ + // stream should contain "", "" and "" tokens + std::setlocale(LC_ALL, "en_US.UTF-8"); + std::string line; + std::string word; + while (input >> word) + { + value.add(word); + } + + { + auto it = value.mTokenToId.find(Vocabulary::mSosStr); + assert(it != value.mTokenToId.end()); + value.mSosId = it->second; + } + + { + auto it = value.mTokenToId.find(Vocabulary::mEosStr); + assert(it != value.mTokenToId.end()); + value.mEosId = it->second; + } + + { + auto it = value.mTokenToId.find(Vocabulary::mUnkStr); + assert(it != value.mTokenToId.end()); + value.mUnkId = it->second; + } + + return input; +} + +int Vocabulary::getStartSequenceId() +{ + return mSosId; +} + +int Vocabulary::getEndSequenceId() +{ + return mEosId; +} +} // namespace nmtSample \ No newline at end of file diff --git a/samples/opensource/sampleNMT/data/vocabulary.h b/samples/opensource/sampleNMT/data/vocabulary.h new file mode 100644 index 00000000..595cadb4 --- /dev/null +++ b/samples/opensource/sampleNMT/data/vocabulary.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SAMPLE_NMT_VOCABULARY_ +#define SAMPLE_NMT_VOCABULARY_ + +#include +#include +#include +#include + +#include "sequenceProperties.h" + +namespace nmtSample +{ +/** \class Vocabulary + * + * \brief String<->Id bijection storage + * + */ +class Vocabulary : public SequenceProperties +{ +public: + typedef std::shared_ptr ptr; + + Vocabulary(); + + friend std::istream& operator>>(std::istream& input, Vocabulary& value); + + /** + * \brief add new token to vocabulary, ID is auto-generated + */ + void add(const std::string& token); + + /** + * \brief get the ID of the token + */ + int getId(const std::string& token) const; + + /** + * \brief get token by ID + */ + std::string getToken(int id) const; + + /** + * \brief get the number of elements in the vocabulary + */ + int getSize() const; + + int getStartSequenceId() override; + + int getEndSequenceId() override; + +private: + static const std::string mSosStr; + static const std::string mUnkStr; + static const std::string mEosStr; + + std::map mTokenToId; + std::vector mIdToToken; + int mNumTokens; + + int mSosId; + int mEosId; + int mUnkId; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_VOCABULARY_ diff --git a/samples/opensource/sampleNMT/deviceBuffer.h b/samples/opensource/sampleNMT/deviceBuffer.h new file mode 100644 index 00000000..bb549cfd --- /dev/null +++ b/samples/opensource/sampleNMT/deviceBuffer.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_DEVICE_BUFFER_ +#define SAMPLE_NMT_DEVICE_BUFFER_ + +#include "cudaError.h" +#include +#include + +namespace nmtSample +{ +template +class DeviceBuffer +{ +public: + typedef std::shared_ptr> ptr; + + DeviceBuffer(size_t elementCount) + : mBuffer(nullptr) + { + CUDA_CHECK(cudaMalloc(&mBuffer, elementCount * sizeof(T))); + } + + virtual ~DeviceBuffer() + { + if (mBuffer) + { + cudaFree(mBuffer); + } + } + + operator T*() + { + return mBuffer; + } + + operator const T*() const + { + return mBuffer; + } + +protected: + T* mBuffer; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_DEVICE_BUFFER_ diff --git a/samples/opensource/sampleNMT/get_newstest2015.sh b/samples/opensource/sampleNMT/get_newstest2015.sh new file mode 100755 index 00000000..a2ff70ef --- /dev/null +++ b/samples/opensource/sampleNMT/get_newstest2015.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash + +# Copyright 2017 Google Inc. +# Modifications copyright (C) 2019 Nvidia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +ROOT_DIR="${1:-${PWD}}" +OUTPUT_DIR="${ROOT_DIR}/intermediate_data" +DATA_DIR="${ROOT_DIR}/data/nmt/deen" +SCRIPTS_DIR="${ROOT_DIR}/scripts" +BPE_CODES="${DATA_DIR}/bpe.32000" + + +sgm_to_txt(){ + ${SCRIPTS_DIR}/mosesdecoder/scripts/ems/support/input-from-sgm.perl < $1 > $2 +} + +tokenize(){ + lan=$1 + ${SCRIPTS_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l ${lan} -threads 8 < $2 > $3 +} + +split_subwords(){ + if [ ! -f "${BPE_CODES}" ]; then + echo "ERROR: ${BPE_CODES} not found. A codes file is required to split newstest into subwords." + exit 0 + fi + ${SCRIPTS_DIR}/subword-nmt/subword_nmt/apply_bpe.py -c ${BPE_CODES} < $1 > $2 +} + +mkdir -p ${OUTPUT_DIR} +mkdir -p ${SCRIPTS_DIR} +mkdir -p ${DATA_DIR} + +# TBD: download bpe and vocab. +if [ ! -f ${OUTPUT_DIR}/sampleNMT_data.tar.bz2 ]; then + echo "Downloading sample_nmt support data..." + curl -o ${OUTPUT_DIR}/sampleNMT_data.tar.bz2 \ + https://developer.download.nvidia.com/compute/machine-learning/tensorrt/models/sampleNMT_data.tar.bz2 +fi + +echo "Extracting sample_nmt support data..." +tar -C ${DATA_DIR} -xf ${OUTPUT_DIR}/sampleNMT_data.tar.bz2 + + +# Clone required scripts +# moses +if [ ! -d "${SCRIPTS_DIR}/mosesdecoder" ]; then + echo "Cloning moses..." + git clone https://github.com/moses-smt/mosesdecoder.git "${SCRIPTS_DIR}/mosesdecoder" +fi + +# subword-nmt +if [ ! -d "${SCRIPTS_DIR}/subword-nmt" ]; then + echo "Cloning subword-nmt..." + git clone https://github.com/rsennrich/subword-nmt.git "${SCRIPTS_DIR}/subword-nmt" +fi + + +echo "Downloading newstests..." +curl -o ${OUTPUT_DIR}/dev.tgz \ + http://data.statmt.org/wmt16/translation-task/dev.tgz + +# Extract everything +echo "Extracting newstest data..." +mkdir -p "${OUTPUT_DIR}/SGM" +tar -xzf "${OUTPUT_DIR}/dev.tgz" -C "${OUTPUT_DIR}/SGM" + +# Convert newstest2015 SGM file into raw text format +echo "Converting newstest2015 SGM file into raw text format..." +raw_de=${OUTPUT_DIR}/newstest2015.de +raw_en=${OUTPUT_DIR}/newstest2015.en +sgm_to_txt ${OUTPUT_DIR}/SGM/dev/newstest2015-deen-src.de.sgm ${raw_de} +sgm_to_txt ${OUTPUT_DIR}/SGM/dev/newstest2015-deen-ref.en.sgm ${raw_en} + +# Tokenize newstest files +echo "Tokenizing..." +tok_de=${OUTPUT_DIR}/newstest2015.tok.de +tok_en=${OUTPUT_DIR}/newstest2015.tok.en +tokenize de ${raw_de} ${tok_de} +tokenize en ${raw_en} ${tok_en} + + +# Split into subwords +echo "Splitting into subwords..." +bpe_de=${DATA_DIR}/newstest2015.tok.bpe.32000.de +bpe_en=${DATA_DIR}/newstest2015.tok.bpe.32000.en +split_subwords ${tok_de} ${bpe_de} +split_subwords ${tok_en} ${bpe_en} + diff --git a/samples/opensource/sampleNMT/model/alignment.h b/samples/opensource/sampleNMT/model/alignment.h new file mode 100644 index 00000000..b2e87fb4 --- /dev/null +++ b/samples/opensource/sampleNMT/model/alignment.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_ALIGNMENT_ +#define SAMPLE_NMT_ALIGNMENT_ + +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Alignment + * + * \brief represents the core of attention mechanism + * + */ +class Alignment : public Component +{ +public: + typedef std::shared_ptr ptr; + + Alignment() = default; + + /** + * \brief add the alignment scores calculation to the network + */ + virtual void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* attentionKeys, + nvinfer1::ITensor* queryStates, nvinfer1::ITensor** alignmentScores) + = 0; + + /** + * \brief add attention keys calculation (from source memory states) to the network + * + * The funtion is called if getAttentionKeySize returns positive value + */ + virtual void addAttentionKeys( + nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* memoryStates, nvinfer1::ITensor** attentionKeys) + = 0; + + /** + * \brief get the size of the source states + */ + virtual int getSourceStatesSize() = 0; + + /** + * \brief get the size of the attention keys + */ + virtual int getAttentionKeySize() = 0; + + ~Alignment() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_ALIGNMENT_ diff --git a/samples/opensource/sampleNMT/model/attention.h b/samples/opensource/sampleNMT/model/attention.h new file mode 100644 index 00000000..969f801a --- /dev/null +++ b/samples/opensource/sampleNMT/model/attention.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_ATTENTION_ +#define SAMPLE_NMT_ATTENTION_ + +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Attention + * + * \brief calculates attention vector from context and decoder output vectors + * + */ +class Attention : public Component +{ +public: + typedef std::shared_ptr ptr; + + Attention() = default; + + /** + * \brief add the attention vector calculation to the network + */ + virtual void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* inputFromDecoder, + nvinfer1::ITensor* context, nvinfer1::ITensor** attentionOutput) + = 0; + + /** + * \brief get the size of the attention vector + */ + virtual int getAttentionSize() = 0; + + ~Attention() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_ATTENTION_ diff --git a/samples/opensource/sampleNMT/model/beamSearchPolicy.cpp b/samples/opensource/sampleNMT/model/beamSearchPolicy.cpp new file mode 100644 index 00000000..1c2aed8e --- /dev/null +++ b/samples/opensource/sampleNMT/model/beamSearchPolicy.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "beamSearchPolicy.h" +#ifdef _MSC_VER +// Macro definition needed to avoid name collision with std::min/max and Windows.h min/max +#define NOMINMAX +#endif +#include +#include +#include +#include + +namespace nmtSample +{ +BeamSearchPolicy::BeamSearchPolicy( + int endSequenceId, LikelihoodCombinationOperator::ptr likelihoodCombinationOperator, int beamWidth) + : mEndSequenceId(endSequenceId) + , mLikelihoodCombinationOperator(likelihoodCombinationOperator) + , mBeamWidth(beamWidth) +{ +} + +void BeamSearchPolicy::initialize(int sampleCount, int* maxOutputSequenceLengths) +{ + mSampleCount = sampleCount; + mMaxOutputSequenceLengths.resize(mSampleCount); + std::copy(maxOutputSequenceLengths, maxOutputSequenceLengths + mSampleCount, &mMaxOutputSequenceLengths[0]); + + mValidSamples.resize(mSampleCount); + std::fill(mValidSamples.begin(), mValidSamples.end(), true); + + mCurrentLikelihoods.resize(mSampleCount * mBeamWidth); + std::fill(mCurrentLikelihoods.begin(), mCurrentLikelihoods.end(), mLikelihoodCombinationOperator->init()); + + mBeamSearchTable.clear(); + + mTimestepId = 0; + + mCandidates.resize(mSampleCount); + mCandidateLikelihoods.resize(mSampleCount); + std::fill(mCandidateLikelihoods.begin(), mCandidateLikelihoods.end(), + mLikelihoodCombinationOperator->smallerThanMinimalLikelihood()); +} + +void BeamSearchPolicy::processTimestep(int validSampleCount, const float* hCombinedLikelihoods, + const int* hVocabularyIndices, const int* hRayOptionIndices, int* hSourceRayIndices, float* hSourceLikelihoods) +{ + ++mTimestepId; + mBeamSearchTable.resize(mTimestepId * mSampleCount * mBeamWidth); + auto baseBeamSearchTable = mBeamSearchTable.begin() + (mTimestepId - 1) * mSampleCount * mBeamWidth; + + for (int sampleId = 0; sampleId < validSampleCount; ++sampleId) + { + auto currentSourceRayIndices = hSourceRayIndices + sampleId * mBeamWidth; + auto currentLikelihoods = hSourceLikelihoods + sampleId * mBeamWidth; + auto currentBeamSearchTable = baseBeamSearchTable + sampleId * mBeamWidth; + + int rayId = 0; + if (mValidSamples[sampleId]) + { + for (; rayId < mBeamWidth; ++rayId) + { + float optionCombinedLikelihood = hCombinedLikelihoods[sampleId * mBeamWidth + rayId]; + + // Check if the current candidate is already better than this option + if (optionCombinedLikelihood <= mCandidateLikelihoods[sampleId]) + break; // The remaining options are even worse + + int optionOriginalRayId = hRayOptionIndices[sampleId * mBeamWidth + rayId] / mBeamWidth; + int optionVocabularyId = hVocabularyIndices[sampleId * mBeamWidth + rayId]; + + if ((optionVocabularyId == mEndSequenceId) || (mTimestepId >= mMaxOutputSequenceLengths[sampleId])) + { + // We have a new candidate output sequence for the sample + mCandidateLikelihoods[sampleId] = optionCombinedLikelihood; + auto& candidate = mCandidates[sampleId]; + candidate.resize(mTimestepId); + backtrack(mTimestepId - 2, sampleId, optionOriginalRayId, &candidate[0], mTimestepId - 2); + candidate[mTimestepId - 1] = optionVocabularyId; + break; + } + + *(currentSourceRayIndices + rayId) = optionOriginalRayId; + *(currentLikelihoods 
+ rayId) = optionCombinedLikelihood; + (currentBeamSearchTable + rayId)->vocabularyId = optionVocabularyId; + (currentBeamSearchTable + rayId)->backtrackId = optionOriginalRayId; + } + + // No valid rays left for the sample + if (rayId == 0) + mValidSamples[sampleId] = false; + } + + // Mark the remaining rays as invalid ones + for (; rayId < mBeamWidth; ++rayId) + { + *(currentSourceRayIndices + rayId) = 0; + *(currentLikelihoods + rayId) = mLikelihoodCombinationOperator->smallerThanMinimalLikelihood(); + (currentBeamSearchTable + rayId)->vocabularyId = mEndSequenceId; + (currentBeamSearchTable + rayId)->backtrackId = 0; + } + } +} + +int BeamSearchPolicy::getTailWithNoWorkRemaining() +{ + for (int sampleId = mSampleCount - 1; sampleId >= 0; --sampleId) + { + if (mValidSamples[sampleId]) + return sampleId + 1; + } + return 0; +} + +void BeamSearchPolicy::readGeneratedResult( + int sampleCount, int maxOutputSequenceLength, int* hOutputData, int* hActualOutputSequenceLengths) +{ + for (int sampleId = 0; sampleId < sampleCount; ++sampleId) + { + if (mCandidateLikelihoods[sampleId] > mLikelihoodCombinationOperator->smallerThanMinimalLikelihood()) + { + // We have a candidate (finished sequence) + std::copy_n(mCandidates[sampleId].begin(), + std::min(static_cast(mCandidates[sampleId].size()), maxOutputSequenceLength), + hOutputData + sampleId * maxOutputSequenceLength); + hActualOutputSequenceLengths[sampleId] = mCandidates[sampleId].size(); + } + else + { + // We don't have a finished sequence generated, will output the unfinished one with the highest likelihood + assert(mValidSamples[sampleId]); + backtrack(mTimestepId - 1, sampleId, 0, hOutputData + sampleId * maxOutputSequenceLength, + maxOutputSequenceLength - 1); + hActualOutputSequenceLengths[sampleId] = mTimestepId; + } + } +} + +void BeamSearchPolicy::backtrack( + int lastTimestepId, int sampleId, int lastTimestepRayId, int* hOutputData, int lastTimestepWriteId) const +{ + int rayId = lastTimestepRayId; + for (int timestepId = lastTimestepId; timestepId >= 0; --timestepId) + { + const auto& entry = mBeamSearchTable[(timestepId * mSampleCount + sampleId) * mBeamWidth + rayId]; + rayId = entry.backtrackId; + if (timestepId <= lastTimestepWriteId) + hOutputData[timestepId] = entry.vocabularyId; + } +} + +std::string BeamSearchPolicy::getInfo() +{ + std::stringstream ss; + ss << "Beam Search Policy, beam = " << mBeamWidth; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/beamSearchPolicy.h b/samples/opensource/sampleNMT/model/beamSearchPolicy.h new file mode 100644 index 00000000..4ca778a4 --- /dev/null +++ b/samples/opensource/sampleNMT/model/beamSearchPolicy.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
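NOTE: The per-timestep beam-search table above stores, for every ray, the emitted vocabulary id and the index of its parent ray in the previous timestep; `BeamSearchPolicy::backtrack` walks that table from the last timestep to the first to recover the token sequence. A minimal standalone sketch of the same idea follows; the names and the tiny example table are illustrative, not the sample's API.

```cpp
// Standalone sketch of beam-search backtracking over a flat
// [timestep x beam] table of (vocabularyId, backtrackId) entries.
#include <cassert>
#include <iostream>
#include <vector>

struct Ray
{
    int vocabularyId; // token emitted by this ray at this timestep
    int backtrackId;  // index of the parent ray in the previous timestep
};

// table is laid out as table[timestep * beamWidth + rayId]
std::vector<int> backtrack(const std::vector<Ray>& table, int beamWidth, int lastTimestep, int lastRayId)
{
    std::vector<int> tokens(lastTimestep + 1);
    int rayId = lastRayId;
    for (int t = lastTimestep; t >= 0; --t)
    {
        const Ray& entry = table[t * beamWidth + rayId];
        tokens[t] = entry.vocabularyId;
        rayId = entry.backtrackId; // follow the parent pointer one timestep back
    }
    return tokens;
}

int main()
{
    // Two timesteps, beam width 2: ray 1 at t=1 descends from ray 0 at t=0.
    std::vector<Ray> table = {{5, 0}, {7, 0}, {9, 0}, {11, 0}};
    auto tokens = backtrack(table, 2, 1, 1);
    assert(tokens.size() == 2);
    std::cout << tokens[0] << " " << tokens[1] << std::endl; // prints "5 11"
    return 0;
}
```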
+ */ + +#ifndef SAMPLE_NMT_BEAM_SEARCH_POLICY_ +#define SAMPLE_NMT_BEAM_SEARCH_POLICY_ + +#include "../component.h" +#include "likelihoodCombinationOperator.h" + +#include + +namespace nmtSample +{ +/** \class BeamSearchPolicy + * + * \brief processes the results of one iteration of the generator with beam search and produces input for the next + * iteration + * + */ +class BeamSearchPolicy : public Component +{ +public: + typedef std::shared_ptr ptr; + + BeamSearchPolicy( + int endSequenceId, LikelihoodCombinationOperator::ptr likelihoodCombinationOperator, int beamWidth); + + void initialize(int sampleCount, int* maxOutputSequenceLengths); + + void processTimestep(int validSampleCount, const float* hCombinedLikelihoods, const int* hVocabularyIndices, + const int* hRayOptionIndices, int* hSourceRayIndices, float* hSourceLikelihoods); + + int getTailWithNoWorkRemaining(); + + void readGeneratedResult( + int sampleCount, int maxOutputSequenceLength, int* hOutputData, int* hActualOutputSequenceLengths); + + std::string getInfo() override; + + ~BeamSearchPolicy() override = default; + +protected: + struct Ray + { + int vocabularyId; + int backtrackId; + }; + + void backtrack( + int lastTimestepId, int sampleId, int lastTimestepRayId, int* hOutputData, int lastTimestepWriteId) const; + +protected: + int mEndSequenceId; + LikelihoodCombinationOperator::ptr mLikelihoodCombinationOperator; + int mBeamWidth; + std::vector mValidSamples; + std::vector mCurrentLikelihoods; + std::vector mBeamSearchTable; + int mSampleCount; + std::vector mMaxOutputSequenceLengths; + int mTimestepId; + + std::vector> mCandidates; + std::vector mCandidateLikelihoods; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_BEAM_SEARCH_POLICY_ diff --git a/samples/opensource/sampleNMT/model/componentWeights.cpp b/samples/opensource/sampleNMT/model/componentWeights.cpp new file mode 100644 index 00000000..6f5274b6 --- /dev/null +++ b/samples/opensource/sampleNMT/model/componentWeights.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "componentWeights.h" +#include +#include + +namespace nmtSample +{ +std::istream& operator>>(std::istream& input, ComponentWeights& value) +{ + std::string footerString("trtsamplenmt"); + size_t footerSize = sizeof(int32_t) + footerString.size(); + char* footer = (char*) malloc(footerSize); + + input.seekg(0, std::ios::end); + size_t fileSize = input.tellg(); + + input.seekg(-footerSize, std::ios::end); + input.read(footer, footerSize); + + size_t metaDataCount = ((int32_t*) footer)[0]; + std::string str(footer + sizeof(int32_t), footer + footerSize); + assert(footerString.compare(str) == 0); + free(footer); + + input.seekg(-(footerSize + metaDataCount * sizeof(int32_t)), std::ios::end); + value.mMetaData.resize(metaDataCount); + size_t metaSize = metaDataCount * sizeof(int32_t); + input.read((char*) (&value.mMetaData[0]), metaSize); + + size_t dataSize = fileSize - footerSize - metaSize; + input.seekg(0, input.beg); + value.mWeights.resize(dataSize); + input.read(&value.mWeights[0], dataSize); + + return input; +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/componentWeights.h b/samples/opensource/sampleNMT/model/componentWeights.h new file mode 100644 index 00000000..4d1ea959 --- /dev/null +++ b/samples/opensource/sampleNMT/model/componentWeights.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SAMPLE_NMT_COMPONENT_WEIGHTS_ +#define SAMPLE_NMT_COMPONENT_WEIGHTS_ + +#include +#include +#include + +namespace nmtSample +{ +/** \class ComponentWeights + * + * \brief weights storage + * + */ +class ComponentWeights +{ +public: + typedef std::shared_ptr ptr; + + ComponentWeights() = default; + + friend std::istream& operator>>(std::istream& input, ComponentWeights& value); + +public: + std::vector mMetaData; + std::vector mWeights; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_COMPONENT_WEIGHTS_ diff --git a/samples/opensource/sampleNMT/model/contextNMT.cpp b/samples/opensource/sampleNMT/model/contextNMT.cpp new file mode 100644 index 00000000..cfb0eb7f --- /dev/null +++ b/samples/opensource/sampleNMT/model/contextNMT.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
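NOTE: `operator>>` above recovers the weight layout from a footer at the end of the file: the raw weight bytes come first, then `metaDataCount` 32-bit metadata values, then a 32-bit count, then the literal string "trtsamplenmt". The exact format is produced by chpt_to_bin.py; the sketch below is an illustrative reader for that assumed layout using a plain ifstream.

```cpp
// Sketch of reading the assumed weight-file layout:
//   [weight bytes][int32 metadata x N][int32 N]["trtsamplenmt"]
#include <cassert>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>

struct RawWeights
{
    std::vector<int32_t> metaData;
    std::vector<char> weights;
};

RawWeights readWeights(const std::string& path)
{
    std::ifstream input(path, std::ios::binary);
    const std::string magic("trtsamplenmt");
    const size_t footerSize = sizeof(int32_t) + magic.size();

    input.seekg(0, std::ios::end);
    const size_t fileSize = static_cast<size_t>(input.tellg());

    // Footer: metadata count followed by the magic string.
    std::vector<char> footer(footerSize);
    input.seekg(-static_cast<std::streamoff>(footerSize), std::ios::end);
    input.read(footer.data(), footerSize);

    int32_t metaCount = 0;
    std::memcpy(&metaCount, footer.data(), sizeof(metaCount));
    assert(std::string(footer.begin() + sizeof(int32_t), footer.end()) == magic);

    // Metadata block sits immediately before the footer.
    RawWeights result;
    result.metaData.resize(metaCount);
    const size_t metaSize = metaCount * sizeof(int32_t);
    input.seekg(-static_cast<std::streamoff>(footerSize + metaSize), std::ios::end);
    input.read(reinterpret_cast<char*>(result.metaData.data()), metaSize);

    // Everything before the metadata block is raw weight data.
    result.weights.resize(fileSize - footerSize - metaSize);
    input.seekg(0, std::ios::beg);
    input.read(result.weights.data(), result.weights.size());
    return result;
}
```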
+ */ + +#include "contextNMT.h" + +#include +#include + +namespace nmtSample +{ +void Context::addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* actualInputSequenceLengths, + nvinfer1::ITensor* memoryStates, nvinfer1::ITensor* alignmentScores, nvinfer1::ITensor** contextOutput) +{ + auto raggedSoftmaxLayer = network->addRaggedSoftMax(*alignmentScores, *actualInputSequenceLengths); + assert(raggedSoftmaxLayer != nullptr); + raggedSoftmaxLayer->setName("Context Ragged Softmax"); + auto softmaxTensor = raggedSoftmaxLayer->getOutput(0); + assert(softmaxTensor != nullptr); + + auto mmLayer = network->addMatrixMultiply(*softmaxTensor, false, *memoryStates, false); + assert(mmLayer != nullptr); + mmLayer->setName("Context Matrix Multiply"); + *contextOutput = mmLayer->getOutput(0); + assert(*contextOutput != nullptr); +} + +std::string Context::getInfo() +{ + return "Ragged softmax + Batch GEMM"; +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/contextNMT.h b/samples/opensource/sampleNMT/model/contextNMT.h new file mode 100644 index 00000000..47f9baa4 --- /dev/null +++ b/samples/opensource/sampleNMT/model/contextNMT.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_CONTEXT_ +#define SAMPLE_NMT_CONTEXT_ + +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Context + * + * \brief calculates context vector from raw alignment scores and memory states + * + */ +class Context : public Component +{ +public: + typedef std::shared_ptr ptr; + + Context() = default; + + /** + * \brief add the context vector calculation to the network + */ + void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* actualInputSequenceLengths, + nvinfer1::ITensor* memoryStates, nvinfer1::ITensor* alignmentScores, nvinfer1::ITensor** contextOutput); + + std::string getInfo() override; + + ~Context() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_CONTEXT_ diff --git a/samples/opensource/sampleNMT/model/debugUtil.cpp b/samples/opensource/sampleNMT/model/debugUtil.cpp new file mode 100644 index 00000000..bd878ab1 --- /dev/null +++ b/samples/opensource/sampleNMT/model/debugUtil.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
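NOTE: Conceptually, `Context::addToModel` normalizes the alignment scores over only the valid source positions of each sample (the ragged softmax) and then takes a weighted sum of the encoder memory states (the batched MatrixMultiply). A plain-C++ sketch of that math for a single sample, without TensorRT:

```cpp
// Context vector for one sample: softmax over the first `validLength`
// alignment scores, then a weighted sum of the memory states.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> contextVector(const std::vector<float>& alignmentScores, // [maxLen]
    const std::vector<std::vector<float>>& memoryStates,                    // [maxLen][hidden]
    int validLength)
{
    // Softmax restricted to the valid source positions (numerically stabilized).
    std::vector<float> weights(validLength);
    float maxScore = alignmentScores[0];
    for (int i = 1; i < validLength; ++i)
        maxScore = std::max(maxScore, alignmentScores[i]);
    float sum = 0.0F;
    for (int i = 0; i < validLength; ++i)
    {
        weights[i] = std::exp(alignmentScores[i] - maxScore);
        sum += weights[i];
    }
    for (auto& w : weights)
        w /= sum;

    // Weighted sum of memory states.
    std::vector<float> context(memoryStates[0].size(), 0.0F);
    for (int i = 0; i < validLength; ++i)
        for (size_t j = 0; j < context.size(); ++j)
            context[j] += weights[i] * memoryStates[i][j];
    return context;
}
```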
+ */ + +#include "debugUtil.h" + +#include +#include + +#include "../cudaError.h" + +namespace nmtSample +{ +std::list DebugUtil::mPlugins; + +DebugUtil::DumpTensorPlugin::DumpTensorPlugin(std::shared_ptr out) + : mOut(out) +{ +} + +int DebugUtil::DumpTensorPlugin::getNbOutputs() const +{ + return 1; +} + +nvinfer1::Dims DebugUtil::DumpTensorPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nbInputDims) +{ + return inputs[0]; +} + +void DebugUtil::DumpTensorPlugin::configure( + const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, int maxBatchSize) +{ + mDims = inputDims[0]; + + *mOut << "Max batch size = " << maxBatchSize << std::endl; + *mOut << "Tensor dimensions = "; + mTensorVolume = 1; + for (int i = 0; i < mDims.nbDims; ++i) + { + if (i > 0) + *mOut << "x"; + *mOut << mDims.d[i]; + mTensorVolume *= mDims.d[i]; + } + mElemsPerRow = 1; + for (int i = mDims.nbDims - 1; i >= 0; --i) + { + if (mElemsPerRow == 1) + mElemsPerRow *= mDims.d[i]; + } + *mOut << std::endl; + + mData = std::make_shared>(mTensorVolume * maxBatchSize); +} + +int DebugUtil::DumpTensorPlugin::initialize() +{ + return 0; +} + +void DebugUtil::DumpTensorPlugin::terminate() +{ + mOut.reset(); + mData.reset(); +} + +size_t DebugUtil::DumpTensorPlugin::getWorkspaceSize(int maxBatchSize) const +{ + return 0; +} + +int DebugUtil::DumpTensorPlugin::enqueue( + int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +{ + int totalElems = batchSize * mTensorVolume; + + CUDA_CHECK(cudaMemcpyAsync(*mData, inputs[0], totalElems * sizeof(float), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaMemcpyAsync(outputs[0], inputs[0], totalElems * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + + *mOut << "Batch size = " << batchSize << "\n"; + int rowCount = totalElems / mElemsPerRow; + for (int rowId = 0; rowId < rowCount; ++rowId) + { + for (int i = 0; i < mElemsPerRow; ++i) + { + if (i > 0) + *mOut << " "; + *mOut << (*mData)[rowId * mElemsPerRow + i]; + } + *mOut << "\n"; + } + *mOut << std::endl; + + return 0; +} + +size_t DebugUtil::DumpTensorPlugin::getSerializationSize() +{ + assert(0); + return 0; +} + +void DebugUtil::DumpTensorPlugin::serialize(void* buffer) +{ + assert(0); +} + +void DebugUtil::addDumpTensorToStream(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, + nvinfer1::ITensor** output, std::shared_ptr out) +{ + assert(!input->getBroadcastAcrossBatch()); + auto plugin = std::make_shared(out); + nvinfer1::ITensor* inputTensors[] = {input}; + auto pluginLayer = network->addPlugin(inputTensors, 1, *plugin); + assert(pluginLayer != nullptr); + *output = pluginLayer->getOutput(0); + assert(*output != nullptr); + mPlugins.push_back(plugin); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/debugUtil.h b/samples/opensource/sampleNMT/model/debugUtil.h new file mode 100644 index 00000000..3c7efc49 --- /dev/null +++ b/samples/opensource/sampleNMT/model/debugUtil.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_DEBUG_UTIL_ +#define SAMPLE_NMT_DEBUG_UTIL_ + +#include "NvInfer.h" + +#include +#include +#include + +#include "../pinnedHostBuffer.h" + +namespace nmtSample +{ +/** \class DebugUtil + * + * \brief container for static debug utility functions + * + */ +class DebugUtil +{ +private: + class DumpTensorPlugin : public nvinfer1::IPlugin + { + public: + typedef std::shared_ptr ptr; + + DumpTensorPlugin(std::shared_ptr out); + + ~DumpTensorPlugin() override = default; + + int getNbOutputs() const override; + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; + + void configure(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, + int maxBatchSize) override; + + int initialize() override; + + void terminate() override; + + size_t getWorkspaceSize(int maxBatchSize) const override; + + int enqueue( + int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + size_t getSerializationSize() override; + + void serialize(void* buffer) override; + + private: + std::shared_ptr mOut; + nvinfer1::Dims mDims; + int mTensorVolume; + int mElemsPerRow; + PinnedHostBuffer::ptr mData; + }; + +public: + static void addDumpTensorToStream(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, + nvinfer1::ITensor** output, std::shared_ptr out); + +private: + static std::list mPlugins; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_DEBUG_UTIL_ diff --git a/samples/opensource/sampleNMT/model/decoder.h b/samples/opensource/sampleNMT/model/decoder.h new file mode 100644 index 00000000..52075080 --- /dev/null +++ b/samples/opensource/sampleNMT/model/decoder.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
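NOTE: The DumpTensorPlugin passes its input through unchanged while copying it to the host and printing it, so it can be spliced between any two layers to inspect an intermediate tensor. The sketch below shows one way it could be used; the stream name and tensor variables are made up, and it assumes the elided template argument of the last parameter is `std::shared_ptr<std::ostream>`.

```cpp
// Illustrative use of DebugUtil::addDumpTensorToStream: splice a pass-through
// dump node after `alignmentScores` and keep building from the returned tensor.
#include <fstream>
#include <memory>

#include "debugUtil.h" // brings in NvInfer.h

void tapTensor(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* alignmentScores,
    nvinfer1::ITensor** tappedScores)
{
    auto dumpStream = std::make_shared<std::ofstream>("alignment_scores.txt");
    nmtSample::DebugUtil::addDumpTensorToStream(network, alignmentScores, tappedScores, dumpStream);
    // Downstream layers should consume *tappedScores instead of alignmentScores,
    // otherwise the dump node has no consumers and may be pruned by the builder.
}
```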
+ */ + +#ifndef SAMPLE_NMT_DECODER_ +#define SAMPLE_NMT_DECODER_ + +#include +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Decoder + * + * \brief encodes single input into output states + * + */ +class Decoder : public Component +{ +public: + typedef std::shared_ptr ptr; + + Decoder() = default; + + /** + * \brief add the memory, cell, and hidden states to the network + */ + virtual void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* inputData, + nvinfer1::ITensor** inputStates, nvinfer1::ITensor** outputData, nvinfer1::ITensor** outputStates) + = 0; + + /** + * \brief get the sizes (vector of them) of the hidden state vectors + */ + virtual std::vector getStateSizes() = 0; + + ~Decoder() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_DECODER_ diff --git a/samples/opensource/sampleNMT/model/embedder.h b/samples/opensource/sampleNMT/model/embedder.h new file mode 100644 index 00000000..4a963e08 --- /dev/null +++ b/samples/opensource/sampleNMT/model/embedder.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_EMBEDDER_ +#define SAMPLE_NMT_EMBEDDER_ + +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Embedder + * + * \brief projects 1-hot vectors (represented as a vector with indices) into dense embedding space + * + */ +class Embedder : public Component +{ +public: + typedef std::shared_ptr ptr; + + Embedder() = default; + + /** + * \brief add the embedding vector calculation to the network + */ + virtual void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, nvinfer1::ITensor** output) + = 0; + + /** + * \brief get the upper bound for the possible values of indices + */ + virtual int getInputDimensionSize() = 0; + + ~Embedder() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_EMBEDDER_ diff --git a/samples/opensource/sampleNMT/model/encoder.h b/samples/opensource/sampleNMT/model/encoder.h new file mode 100644 index 00000000..a5fe09db --- /dev/null +++ b/samples/opensource/sampleNMT/model/encoder.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SAMPLE_NMT_ENCODER_ +#define SAMPLE_NMT_ENCODER_ + +#include +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Encoder + * + * \brief encodes input sentences into output states + * + */ +class Encoder : public Component +{ +public: + typedef std::shared_ptr ptr; + + Encoder() = default; + + /** + * \brief add the memory and last timestep states to the network + * lastTimestepHiddenStates is the pointer to the tensor where the encoder stores all layer hidden states for the + * last timestep (which is dependent on the sample), the function should define the tensor, it could be nullptr + * indicating these data are not needed + */ + virtual void addToModel(nvinfer1::INetworkDefinition* network, int maxInputSequenceLength, + nvinfer1::ITensor* inputEmbeddedData, nvinfer1::ITensor* actualInputSequenceLengths, + nvinfer1::ITensor** inputStates, nvinfer1::ITensor** memoryStates, nvinfer1::ITensor** lastTimestepStates) + = 0; + + /** + * \brief get the size of the memory state vector + */ + virtual int getMemoryStatesSize() = 0; + + /** + * \brief get the sizes (vector of them) of the hidden state vectors + */ + virtual std::vector getStateSizes() = 0; + + ~Encoder() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_ENCODER_ diff --git a/samples/opensource/sampleNMT/model/likelihood.h b/samples/opensource/sampleNMT/model/likelihood.h new file mode 100644 index 00000000..fa6dbf10 --- /dev/null +++ b/samples/opensource/sampleNMT/model/likelihood.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_LIKELIHOOD_ +#define SAMPLE_NMT_LIKELIHOOD_ + +#include + +#include "../component.h" +#include "NvInfer.h" +#include "likelihoodCombinationOperator.h" + +namespace nmtSample +{ +/** \class Likelihood + * + * \brief calculates likelihood and TopK indices for the raw input logits + * + */ +class Likelihood : public Component +{ +public: + typedef std::shared_ptr ptr; + + Likelihood() = default; + + virtual LikelihoodCombinationOperator::ptr getLikelihoodCombinationOperator() const = 0; + + /** + * \brief add calculation of likelihood and TopK indices to the network + */ + virtual void addToModel(nvinfer1::INetworkDefinition* network, int beamWidth, nvinfer1::ITensor* inputLogits, + nvinfer1::ITensor* inputLikelihoods, nvinfer1::ITensor** newCombinedLikelihoods, + nvinfer1::ITensor** newRayOptionIndices, nvinfer1::ITensor** newVocabularyIndices) + = 0; + + ~Likelihood() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_LIKELIHOOD_ diff --git a/samples/opensource/sampleNMT/model/likelihoodCombinationOperator.h b/samples/opensource/sampleNMT/model/likelihoodCombinationOperator.h new file mode 100644 index 00000000..651c1f3e --- /dev/null +++ b/samples/opensource/sampleNMT/model/likelihoodCombinationOperator.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_LIKELIHOOD_COMBINATION_ +#define SAMPLE_NMT_LIKELIHOOD_COMBINATION_ + +#include + +namespace nmtSample +{ +class LikelihoodCombinationOperator +{ +public: + typedef std::shared_ptr ptr; + + // The return value should be less or equal to rayLikelihood + virtual float combine(float rayLikelihood, float optionLikelihood) const = 0; + + virtual float init() const = 0; + + virtual float smallerThanMinimalLikelihood() const = 0; + + virtual ~LikelihoodCombinationOperator() = default; + +protected: + LikelihoodCombinationOperator() = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_LIKELIHOOD_COMBINATION_ diff --git a/samples/opensource/sampleNMT/model/lstmDecoder.cpp b/samples/opensource/sampleNMT/model/lstmDecoder.cpp new file mode 100644 index 00000000..8ec7face --- /dev/null +++ b/samples/opensource/sampleNMT/model/lstmDecoder.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lstmDecoder.h" + +#include "trtUtil.h" + +#include "debugUtil.h" +#include + +#include +#include + +namespace nmtSample +{ +LSTMDecoder::LSTMDecoder(ComponentWeights::ptr weights) + : mWeights(weights) +{ + // please refer to chpt_to_bin.py for the details on the format + assert(mWeights->mMetaData.size() >= 4); + nvinfer1::DataType dataType = static_cast(mWeights->mMetaData[0]); + assert(dataType == nvinfer1::DataType::kFLOAT); + mRNNKind = mWeights->mMetaData[1]; + mNumLayers = mWeights->mMetaData[2]; + mNumUnits = mWeights->mMetaData[3]; + size_t elementSize = inferTypeToBytes(dataType); + // compute weights offsets + size_t dataSize = 2 * mNumUnits; + size_t kernelOffset = 0; + size_t biasStartOffset = ((4 * dataSize + 4 * mNumUnits) * mNumUnits) * elementSize + + 8 * mNumUnits * mNumUnits * (mNumLayers - 1) * elementSize; + size_t biasOffset = biasStartOffset; + int numGates = 8; + for (int layerIndex = 0; layerIndex < mNumLayers; layerIndex++) + { + for (int gateIndex = 0; gateIndex < numGates; gateIndex++) + { + // encoder input size == mNumUnits + int64_t inputSize = ((layerIndex == 0) && (gateIndex < 4)) ? 
dataSize : mNumUnits; + nvinfer1::Weights gateKernelWeights{dataType, &mWeights->mWeights[0] + kernelOffset, inputSize * mNumUnits}; + nvinfer1::Weights gateBiasWeights{dataType, &mWeights->mWeights[0] + biasOffset, mNumUnits}; + mGateKernelWeights.push_back(std::move(gateKernelWeights)); + mGateBiasWeights.push_back(std::move(gateBiasWeights)); + kernelOffset = kernelOffset + inputSize * mNumUnits * elementSize; + biasOffset = biasOffset + mNumUnits * elementSize; + } + } + assert(kernelOffset + biasOffset - biasStartOffset == mWeights->mWeights.size()); +} + +void LSTMDecoder::addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* inputEmbeddedData, + nvinfer1::ITensor** inputStates, nvinfer1::ITensor** outputData, nvinfer1::ITensor** outputStates) +{ + int beamWidth; + int inputWidth; + { + auto dims = inputEmbeddedData->getDimensions(); + assert(dims.nbDims == 2); + assert(dims.type[0] == nvinfer1::DimensionType::kINDEX); + beamWidth = dims.d[0]; + assert(dims.type[1] == nvinfer1::DimensionType::kCHANNEL); + inputWidth = dims.d[1]; + } + + nvinfer1::ITensor* shuffledInput; + { + auto shuffleLayer = network->addShuffle(*inputEmbeddedData); + assert(shuffleLayer != nullptr); + shuffleLayer->setName("Reshape input for LSTM decoder"); + nvinfer1::Dims shuffleDims{3, {beamWidth, 1, inputWidth}, + {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kSEQUENCE, nvinfer1::DimensionType::kCHANNEL}}; + shuffleLayer->setReshapeDimensions(shuffleDims); + shuffledInput = shuffleLayer->getOutput(0); + assert(shuffledInput != nullptr); + } + + auto decoderLayer = network->addRNNv2(*shuffledInput, mNumLayers, mNumUnits, 1, nvinfer1::RNNOperation::kLSTM); + assert(decoderLayer != nullptr); + decoderLayer->setName("LSTM decoder"); + + decoderLayer->setInputMode(nvinfer1::RNNInputMode::kLINEAR); + decoderLayer->setDirection(nvinfer1::RNNDirection::kUNIDIRECTION); + + std::vector gateOrder({nvinfer1::RNNGateType::kFORGET, nvinfer1::RNNGateType::kINPUT, + nvinfer1::RNNGateType::kCELL, nvinfer1::RNNGateType::kOUTPUT}); + for (size_t i = 0; i < mGateKernelWeights.size(); i++) + { + // we have 4 + 4 gates + bool isW = ((i % 8) < 4); + decoderLayer->setWeightsForGate(i / 8, gateOrder[i % 4], isW, mGateKernelWeights[i]); + decoderLayer->setBiasForGate(i / 8, gateOrder[i % 4], isW, mGateBiasWeights[i]); + } + + decoderLayer->setHiddenState(*inputStates[0]); + decoderLayer->setCellState(*inputStates[1]); + *outputData = decoderLayer->getOutput(0); + assert(*outputData != nullptr); + + { + auto shuffleLayer = network->addShuffle(**outputData); + assert(shuffleLayer != nullptr); + shuffleLayer->setName("Reshape output from LSTM decoder"); + nvinfer1::Dims shuffleDims{ + 2, {beamWidth, mNumUnits}, {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kCHANNEL}}; + shuffleLayer->setReshapeDimensions(shuffleDims); + auto shuffledOutput = shuffleLayer->getOutput(0); + assert(shuffledOutput != nullptr); + *outputData = shuffledOutput; + } + + // Per layer hidden output + outputStates[0] = decoderLayer->getOutput(1); + assert(outputStates[0] != nullptr); + + // Per layer cell output + outputStates[1] = decoderLayer->getOutput(2); + assert(outputStates[1] != nullptr); +} + +std::vector LSTMDecoder::getStateSizes() +{ + nvinfer1::Dims hiddenStateDims{ + 2, {mNumLayers, mNumUnits}, {nvinfer1::DimensionType::kSPATIAL, nvinfer1::DimensionType::kCHANNEL}}; + nvinfer1::Dims cellStateDims{ + 2, {mNumLayers, mNumUnits}, {nvinfer1::DimensionType::kSPATIAL, nvinfer1::DimensionType::kCHANNEL}}; + return 
std::vector({hiddenStateDims, cellStateDims}); +} + +std::string LSTMDecoder::getInfo() +{ + std::stringstream ss; + ss << "LSTM Decoder, num layers = " << mNumLayers << ", num units = " << mNumUnits; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/lstmDecoder.h b/samples/opensource/sampleNMT/model/lstmDecoder.h new file mode 100644 index 00000000..5248ddff --- /dev/null +++ b/samples/opensource/sampleNMT/model/lstmDecoder.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_LSTM_DECODER_ +#define SAMPLE_NMT_LSTM_DECODER_ + +#include "decoder.h" + +#include "componentWeights.h" + +namespace nmtSample +{ +/** \class LSTMDecoder + * + * \brief encodes single input into output states with LSTM + * + */ +class LSTMDecoder : public Decoder +{ +public: + LSTMDecoder(ComponentWeights::ptr weights); + + void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* inputEmbeddedData, + nvinfer1::ITensor** inputStates, nvinfer1::ITensor** outputData, nvinfer1::ITensor** outputStates) override; + + std::vector getStateSizes() override; + + std::string getInfo() override; + + ~LSTMDecoder() override = default; + +protected: + ComponentWeights::ptr mWeights; + std::vector mGateKernelWeights; + std::vector mGateBiasWeights; + bool mRNNKind; + int mNumLayers; + int mNumUnits; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_LSTM_DECODER_ diff --git a/samples/opensource/sampleNMT/model/lstmEncoder.cpp b/samples/opensource/sampleNMT/model/lstmEncoder.cpp new file mode 100644 index 00000000..f156dec3 --- /dev/null +++ b/samples/opensource/sampleNMT/model/lstmEncoder.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
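NOTE: The LSTMDecoder constructor above assumes one flat weight buffer: per layer, four input ("W") gate kernels followed by four recurrent ("R") gate kernels, with all biases packed after every kernel, and with layer 0's W gates sized for the concatenated [embedding, attention] input (hence the doubled input width). A standalone sketch of that offset arithmetic under those assumptions; offsets here are in elements for clarity, while the sample tracks bytes.

```cpp
// Sketch of the flat kernel layout assumed by the LSTM decoder constructor.
#include <cstdint>
#include <iostream>
#include <vector>

struct GateSlice
{
    size_t offset; // element offset into the flat weight buffer
    size_t count;  // number of elements in this gate's kernel
};

std::vector<GateSlice> kernelSlices(int numLayers, int numUnits)
{
    const int numGates = 8;                     // 4 "W" (input) + 4 "R" (recurrent) gates per layer
    const int64_t decoderInput = 2 * numUnits;  // layer 0 sees [embedding, attention]
    std::vector<GateSlice> slices;
    size_t offset = 0;
    for (int layer = 0; layer < numLayers; ++layer)
    {
        for (int gate = 0; gate < numGates; ++gate)
        {
            const int64_t inputSize = (layer == 0 && gate < 4) ? decoderInput : numUnits;
            slices.push_back({offset, static_cast<size_t>(inputSize * numUnits)});
            offset += static_cast<size_t>(inputSize * numUnits);
        }
    }
    return slices;
}

int main()
{
    for (const auto& s : kernelSlices(2, 512))
        std::cout << s.offset << " " << s.count << "\n";
    return 0;
}
```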
+ */ + +#include "lstmEncoder.h" +#include "trtUtil.h" + +#include +#include + +namespace nmtSample +{ + +LSTMEncoder::LSTMEncoder(ComponentWeights::ptr weights) + : mWeights(weights) +{ + // please refer to chpt_to_bin.py for the details on the format + assert(mWeights->mMetaData.size() >= 4); + const nvinfer1::DataType dataType = static_cast(mWeights->mMetaData[0]); + assert(dataType == nvinfer1::DataType::kFLOAT); + mRNNKind = mWeights->mMetaData[1]; + mNumLayers = mWeights->mMetaData[2]; + mNumUnits = mWeights->mMetaData[3]; + + size_t elementSize = inferTypeToBytes(dataType); + // compute weights offsets + size_t kernelOffset = 0; + size_t biasStartOffset = ((4 * mNumUnits + 4 * mNumUnits) * mNumUnits * mNumLayers) * elementSize; + size_t biasOffset = biasStartOffset; + int numGates = 8; + for (int layerIndex = 0; layerIndex < mNumLayers; layerIndex++) + { + for (int gateIndex = 0; gateIndex < numGates; gateIndex++) + { + // encoder input size == mNumUnits + int64_t inputSize = ((layerIndex == 0) && (gateIndex < 4)) ? mNumUnits : mNumUnits; + nvinfer1::Weights gateKernelWeights{dataType, &mWeights->mWeights[0] + kernelOffset, inputSize * mNumUnits}; + nvinfer1::Weights gateBiasWeights{dataType, &mWeights->mWeights[0] + biasOffset, mNumUnits}; + mGateKernelWeights.push_back(std::move(gateKernelWeights)); + mGateBiasWeights.push_back(std::move(gateBiasWeights)); + kernelOffset = kernelOffset + inputSize * mNumUnits * elementSize; + biasOffset = biasOffset + mNumUnits * elementSize; + } + } + assert(kernelOffset + biasOffset - biasStartOffset == mWeights->mWeights.size()); +} + +void LSTMEncoder::addToModel(nvinfer1::INetworkDefinition* network, int maxInputSequenceLength, + nvinfer1::ITensor* inputEmbeddedData, nvinfer1::ITensor* actualInputSequenceLengths, + nvinfer1::ITensor** inputStates, nvinfer1::ITensor** memoryStates, nvinfer1::ITensor** lastTimestepStates) +{ + auto encoderLayer = network->addRNNv2( + *inputEmbeddedData, mNumLayers, mNumUnits, maxInputSequenceLength, nvinfer1::RNNOperation::kLSTM); + assert(encoderLayer != nullptr); + encoderLayer->setName("LSTM encoder"); + + encoderLayer->setSequenceLengths(*actualInputSequenceLengths); + encoderLayer->setInputMode(nvinfer1::RNNInputMode::kLINEAR); + encoderLayer->setDirection(nvinfer1::RNNDirection::kUNIDIRECTION); + + std::vector gateOrder({nvinfer1::RNNGateType::kFORGET, nvinfer1::RNNGateType::kINPUT, + nvinfer1::RNNGateType::kCELL, nvinfer1::RNNGateType::kOUTPUT}); + for (size_t i = 0; i < mGateKernelWeights.size(); i++) + { + // we have 4 + 4 gates + bool isW = ((i % 8) < 4); + encoderLayer->setWeightsForGate(i / 8, gateOrder[i % 4], isW, mGateKernelWeights[i]); + encoderLayer->setBiasForGate(i / 8, gateOrder[i % 4], isW, mGateBiasWeights[i]); + } + + encoderLayer->setHiddenState(*inputStates[0]); + encoderLayer->setCellState(*inputStates[1]); + *memoryStates = encoderLayer->getOutput(0); + assert(*memoryStates != nullptr); + + if (lastTimestepStates) + { + // Per layer hidden output + lastTimestepStates[0] = encoderLayer->getOutput(1); + assert(lastTimestepStates[0] != nullptr); + + // Per layer cell output + lastTimestepStates[1] = encoderLayer->getOutput(2); + assert(lastTimestepStates[1] != nullptr); + } +} + +int LSTMEncoder::getMemoryStatesSize() +{ + return mNumUnits; +} + +std::vector LSTMEncoder::getStateSizes() +{ + nvinfer1::Dims hiddenStateDims{ + 2, {mNumLayers, mNumUnits}, {nvinfer1::DimensionType::kSPATIAL, nvinfer1::DimensionType::kCHANNEL}}; + nvinfer1::Dims cellStateDims{ + 2, {mNumLayers, mNumUnits}, 
{nvinfer1::DimensionType::kSPATIAL, nvinfer1::DimensionType::kCHANNEL}}; + return std::vector({hiddenStateDims, cellStateDims}); +} + +std::string LSTMEncoder::getInfo() +{ + std::stringstream ss; + ss << "LSTM Encoder, num layers = " << mNumLayers << ", num units = " << mNumUnits; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/lstmEncoder.h b/samples/opensource/sampleNMT/model/lstmEncoder.h new file mode 100644 index 00000000..26ee2e97 --- /dev/null +++ b/samples/opensource/sampleNMT/model/lstmEncoder.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_LSTM_ENCODER_ +#define SAMPLE_NMT_LSTM_ENCODER_ + +#include "encoder.h" + +#include "componentWeights.h" + +namespace nmtSample +{ +/** \class LSTMEncoder + * + * \brief encodes input sentences into output states using LSTM + * + */ +class LSTMEncoder : public Encoder +{ +public: + LSTMEncoder(ComponentWeights::ptr weights); + + void addToModel(nvinfer1::INetworkDefinition* network, int maxInputSequenceLength, + nvinfer1::ITensor* inputEmbeddedData, nvinfer1::ITensor* actualInputSequenceLengths, + nvinfer1::ITensor** inputStates, nvinfer1::ITensor** memoryStates, + nvinfer1::ITensor** lastTimestepStates) override; + + int getMemoryStatesSize() override; + + std::vector getStateSizes() override; + + std::string getInfo() override; + + ~LSTMEncoder() override = default; + +protected: + ComponentWeights::ptr mWeights; + std::vector mGateKernelWeights; + std::vector mGateBiasWeights; + bool mRNNKind; + int mNumLayers; + int mNumUnits; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_LSTM_ENCODER_ diff --git a/samples/opensource/sampleNMT/model/multiplicativeAlignment.cpp b/samples/opensource/sampleNMT/model/multiplicativeAlignment.cpp new file mode 100644 index 00000000..607d5837 --- /dev/null +++ b/samples/opensource/sampleNMT/model/multiplicativeAlignment.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
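NOTE: When handing the flat list of per-gate weights to the RNNv2 layer, both the encoder and decoder map the running index i to a layer, a gate, and the W/R flag: layer = i / 8, gate = gateOrder[i % 4], isW = (i % 8) < 4. A tiny sketch of that mapping, with a local enum standing in for nvinfer1::RNNGateType:

```cpp
// Map a flat gate-weight index to (layer, gate, isW), mirroring the loops
// that call setWeightsForGate / setBiasForGate. Illustrative only.
#include <array>
#include <cstdio>

enum class Gate { kFORGET, kINPUT, kCELL, kOUTPUT };

struct GateLocation
{
    int layer;
    Gate gate;
    bool isW; // true: input ("W") weights, false: recurrent ("R") weights
};

GateLocation locate(int i)
{
    static const std::array<Gate, 4> gateOrder = {Gate::kFORGET, Gate::kINPUT, Gate::kCELL, Gate::kOUTPUT};
    return {i / 8, gateOrder[i % 4], (i % 8) < 4};
}

int main()
{
    for (int i = 0; i < 16; ++i) // two layers' worth of gates
    {
        const auto loc = locate(i);
        std::printf("i=%2d layer=%d gate=%d isW=%d\n", i, loc.layer, static_cast<int>(loc.gate), loc.isW);
    }
    return 0;
}
```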
+ */ + +#include "multiplicativeAlignment.h" + +#include +#include + +namespace nmtSample +{ +MultiplicativeAlignment::MultiplicativeAlignment(ComponentWeights::ptr weights) + : mWeights(weights) +{ + // please refer to chpt_to_bin.py for the details on the format + assert(mWeights->mMetaData.size() >= 3); + mKernelWeights.type = static_cast(mWeights->mMetaData[0]); + assert(mKernelWeights.type == nvinfer1::DataType::kFLOAT); + mInputChannelCount = mWeights->mMetaData[1]; + mOutputChannelCount = mWeights->mMetaData[2]; + + mKernelWeights.values = (void*) (&mWeights->mWeights[0]); + mKernelWeights.count = mInputChannelCount * mOutputChannelCount; +} + +void MultiplicativeAlignment::addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* attentionKeys, + nvinfer1::ITensor* queryStates, nvinfer1::ITensor** alignmentScores) +{ + auto mmLayer = network->addMatrixMultiply(*queryStates, false, *attentionKeys, true); + assert(mmLayer != nullptr); + mmLayer->setName("Raw Alignment Scores MM (Queries x Keys) in multiplicative attention"); + *alignmentScores = mmLayer->getOutput(0); + assert(*alignmentScores != nullptr); +} + +void MultiplicativeAlignment::addAttentionKeys( + nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* memoryStates, nvinfer1::ITensor** attentionKeys) +{ + nvinfer1::Dims weightDims{2, {mInputChannelCount, mOutputChannelCount}, + {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kCHANNEL}}; + auto constLayer = network->addConstant(weightDims, mKernelWeights); + assert(constLayer != nullptr); + constLayer->setName("Matrix in multiplicative attention"); + auto weights = constLayer->getOutput(0); + assert(weights != nullptr); + + auto mmLayer = network->addMatrixMultiply(*memoryStates, false, *weights, false); + assert(mmLayer != nullptr); + mmLayer->setName("Attention Keys MM in multiplicative attention"); + *attentionKeys = mmLayer->getOutput(0); + assert(*attentionKeys != nullptr); +} + +int MultiplicativeAlignment::getSourceStatesSize() +{ + return mInputChannelCount; +} + +int MultiplicativeAlignment::getAttentionKeySize() +{ + return mOutputChannelCount; +} + +std::string MultiplicativeAlignment::getInfo() +{ + std::stringstream ss; + ss << "Multiplicative Alignment, source states size = " << mInputChannelCount + << ", attention keys size = " << mOutputChannelCount; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/multiplicativeAlignment.h b/samples/opensource/sampleNMT/model/multiplicativeAlignment.h new file mode 100644 index 00000000..44fd0017 --- /dev/null +++ b/samples/opensource/sampleNMT/model/multiplicativeAlignment.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
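NOTE: Multiplicative (Luong) alignment first projects the encoder memory states into "attention keys" with a learned matrix (addAttentionKeys) and then scores each decoder query against every key with a dot product (the MatrixMultiply in addToModel). A plain sketch of the score computation for one decoder query:

```cpp
// Luong-style alignment scores for a single decoder query:
//   keys[i]  = memoryStates[i] * W      (addAttentionKeys)
//   score[i] = dot(query, keys[i])      (addToModel)
#include <vector>

std::vector<float> alignmentScores(const std::vector<float>& query,   // [keySize]
    const std::vector<std::vector<float>>& attentionKeys)             // [srcLen][keySize]
{
    std::vector<float> scores(attentionKeys.size(), 0.0F);
    for (size_t i = 0; i < attentionKeys.size(); ++i)
        for (size_t j = 0; j < query.size(); ++j)
            scores[i] += query[j] * attentionKeys[i][j];
    return scores;
}
```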
+ */ + +#ifndef SAMPLE_NMT_MULTIPLICATIVE_ALIGNMENT_ +#define SAMPLE_NMT_MULTIPLICATIVE_ALIGNMENT_ + +#include "alignment.h" + +#include "componentWeights.h" + +namespace nmtSample +{ +/** \class MultiplicativeAlignment + * + * \brief alignment scores from Luong attention mechanism + * + */ +class MultiplicativeAlignment : public Alignment +{ +public: + MultiplicativeAlignment(ComponentWeights::ptr weights); + + void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* attentionKeys, + nvinfer1::ITensor* queryStates, nvinfer1::ITensor** alignmentScores) override; + + void addAttentionKeys(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* memoryStates, + nvinfer1::ITensor** attentionKeys) override; + + int getSourceStatesSize() override; + + int getAttentionKeySize() override; + + std::string getInfo() override; + + ~MultiplicativeAlignment() override = default; + +protected: + ComponentWeights::ptr mWeights; + nvinfer1::Weights mKernelWeights; + int mInputChannelCount; + int mOutputChannelCount; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_MULTIPLICATIVE_ALIGNMENT_ diff --git a/samples/opensource/sampleNMT/model/projection.h b/samples/opensource/sampleNMT/model/projection.h new file mode 100644 index 00000000..8c2d766b --- /dev/null +++ b/samples/opensource/sampleNMT/model/projection.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_PROJECTION_ +#define SAMPLE_NMT_PROJECTION_ + +#include + +#include "../component.h" +#include "NvInfer.h" + +namespace nmtSample +{ +/** \class Projection + * + * \brief calculates raw logits + * + */ +class Projection : public Component +{ +public: + typedef std::shared_ptr ptr; + + Projection() = default; + + /** + * \brief add raw logits to the network + */ + virtual void addToModel( + nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, nvinfer1::ITensor** outputLogits) + = 0; + + /** + * \brief get the size of raw logits vector + */ + virtual int getOutputSize() = 0; + + ~Projection() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_PROJECTION_ diff --git a/samples/opensource/sampleNMT/model/slpAttention.cpp b/samples/opensource/sampleNMT/model/slpAttention.cpp new file mode 100644 index 00000000..7440e599 --- /dev/null +++ b/samples/opensource/sampleNMT/model/slpAttention.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "slpAttention.h" + +#include +#include + +namespace nmtSample +{ +SLPAttention::SLPAttention(ComponentWeights::ptr weights) + : mWeights(weights) +{ + // please refer to chpt_to_bin.py for the details on the format + assert(mWeights->mMetaData.size() >= 3); + mKernelWeights.type = static_cast(mWeights->mMetaData[0]); + assert(mKernelWeights.type == nvinfer1::DataType::kFLOAT); + mInputChannelCount = mWeights->mMetaData[1]; + mOutputChannelCount = mWeights->mMetaData[2]; + + mKernelWeights.values = (void*) (&mWeights->mWeights[0]); + mKernelWeights.count = mInputChannelCount * mOutputChannelCount; +} + +void SLPAttention::addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* inputFromDecoder, + nvinfer1::ITensor* context, nvinfer1::ITensor** attentionOutput) +{ + nvinfer1::ITensor* inputTensors[] = {inputFromDecoder, context}; + auto concatLayer = network->addConcatenation(inputTensors, 2); + assert(concatLayer != nullptr); + concatLayer->setName("Concatinate decoder output and context"); + concatLayer->setAxis(1); + auto concatinatedTensor = concatLayer->getOutput(0); + assert(concatinatedTensor != nullptr); + + nvinfer1::Dims weightDims{2, {mInputChannelCount, mOutputChannelCount}, + {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kCHANNEL}}; + auto constLayer = network->addConstant(weightDims, mKernelWeights); + assert(constLayer != nullptr); + constLayer->setName("Attention Matrix"); + auto weights = constLayer->getOutput(0); + assert(weights != nullptr); + + auto mmLayer = network->addMatrixMultiply(*concatinatedTensor, false, *weights, false); + assert(mmLayer != nullptr); + mmLayer->setName("Attention Matrix Multiply"); + + auto actLayer = network->addActivation(*mmLayer->getOutput(0), nvinfer1::ActivationType::kTANH); + assert(actLayer != nullptr); + + *attentionOutput = actLayer->getOutput(0); + assert(*attentionOutput != nullptr); +} + +int SLPAttention::getAttentionSize() +{ + return mOutputChannelCount; +} + +std::string SLPAttention::getInfo() +{ + std::stringstream ss; + ss << "SLP Attention, num inputs = " << mInputChannelCount << ", num outputs = " << mOutputChannelCount; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/slpAttention.h b/samples/opensource/sampleNMT/model/slpAttention.h new file mode 100644 index 00000000..078d8828 --- /dev/null +++ b/samples/opensource/sampleNMT/model/slpAttention.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
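NOTE: SLPAttention forms the attention output as tanh(W · [decoderOutput; context]): the decoder output and the context vector are concatenated along the channel axis, multiplied by a single weight matrix, and passed through tanh. A plain-loop sketch of that computation for one sample (the weight matrix is assumed non-empty and stored as [inputChannels][outputChannels]):

```cpp
// attention = tanh([decoderOutput, context] x W), illustrative plain C++.
#include <cmath>
#include <vector>

std::vector<float> slpAttention(const std::vector<float>& decoderOutput, const std::vector<float>& context,
    const std::vector<std::vector<float>>& weights) // [inputChannels][outputChannels]
{
    // Concatenate along the channel axis, as the IConcatenationLayer does.
    std::vector<float> concatenated(decoderOutput);
    concatenated.insert(concatenated.end(), context.begin(), context.end());

    // Single linear projection followed by tanh.
    const size_t outputChannels = weights[0].size();
    std::vector<float> attention(outputChannels, 0.0F);
    for (size_t i = 0; i < concatenated.size(); ++i)
        for (size_t j = 0; j < outputChannels; ++j)
            attention[j] += concatenated[i] * weights[i][j];
    for (auto& v : attention)
        v = std::tanh(v);
    return attention;
}
```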
+ */ + +#ifndef SAMPLE_NMT_SLP_ATTENTION_ +#define SAMPLE_NMT_SLP_ATTENTION_ + +#include "attention.h" + +#include "componentWeights.h" + +namespace nmtSample +{ +/** \class SLPAttention + * + * \brief Linear attention calculation + * + * Calculates attention vector by concatinating input from the decoder with context vector + * and projecting the result into attention space by multiplying with weight matrix + * + */ +class SLPAttention : public Attention +{ +public: + SLPAttention(ComponentWeights::ptr weights); + + void addToModel(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* inputFromDecoder, + nvinfer1::ITensor* context, nvinfer1::ITensor** attentionOutput) override; + + int getAttentionSize() override; + + std::string getInfo() override; + +protected: + ComponentWeights::ptr mWeights; + nvinfer1::Weights mKernelWeights; + int mInputChannelCount; + int mOutputChannelCount; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_SLP_ATTENTION_ diff --git a/samples/opensource/sampleNMT/model/slpEmbedder.cpp b/samples/opensource/sampleNMT/model/slpEmbedder.cpp new file mode 100644 index 00000000..5f3224da --- /dev/null +++ b/samples/opensource/sampleNMT/model/slpEmbedder.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "slpEmbedder.h" +#include "common.h" + +#include +#include + +namespace nmtSample +{ +SLPEmbedder::SLPEmbedder(ComponentWeights::ptr weights) + : mWeights(weights) +{ + // please refer to chpt_to_bin.py for the details on the format + assert(mWeights->mMetaData.size() >= 3); + mKernelWeights.type = static_cast(mWeights->mMetaData[0]); + assert(mKernelWeights.type == nvinfer1::DataType::kFLOAT); + // Resize dimensions to be multiples of gPadMultiple for performance + mNumInputs = samplesCommon::roundUp(mWeights->mMetaData[1], gPadMultiple); // matches projection output channels + mNumOutputs = samplesCommon::roundUp(mWeights->mMetaData[2], gPadMultiple); // matches projection input channels + mResizedKernelWeights = resizeWeights( + mWeights->mMetaData[1], mWeights->mMetaData[2], mNumInputs, mNumOutputs, (const float*) &mWeights->mWeights[0]); + mKernelWeights.values = mResizedKernelWeights.data(); + mKernelWeights.count = mNumInputs * mNumOutputs; +} + +void SLPEmbedder::addToModel( + nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, nvinfer1::ITensor** output) +{ + nvinfer1::Dims weightDims{ + 2, {mNumInputs, mNumOutputs}, {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kCHANNEL}}; + auto constLayer = network->addConstant(weightDims, mKernelWeights); + assert(constLayer != nullptr); + constLayer->setName("Embedding matrix"); + auto weights = constLayer->getOutput(0); + assert(weights != nullptr); + + auto gatherLayer = network->addGather(*weights, *input, 0); + assert(gatherLayer != nullptr); + gatherLayer->setName("Gather in embedding"); + *output = gatherLayer->getOutput(0); + assert(*output != nullptr); +} + +int SLPEmbedder::getInputDimensionSize() +{ + return mNumInputs; +} + +std::string SLPEmbedder::getInfo() +{ + std::stringstream ss; + ss << "SLP Embedder, num inputs = " << mNumInputs << ", num outputs = " << mNumOutputs; + return ss.str(); +} +} // namespace nmtSample diff --git a/samples/opensource/sampleNMT/model/slpEmbedder.h b/samples/opensource/sampleNMT/model/slpEmbedder.h new file mode 100644 index 00000000..700d1f2b --- /dev/null +++ b/samples/opensource/sampleNMT/model/slpEmbedder.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
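NOTE: SLPEmbedder performs its lookup with an IGatherLayer: the constant weight matrix of shape [vocab, hidden] is indexed along axis 0 by the input token ids, so each id selects one embedding row. A minimal sketch of the equivalent lookup:

```cpp
// Embedding lookup equivalent to addGather(weights, input, /*axis=*/0):
// each token id selects one row of the [vocab x hidden] matrix.
#include <vector>

std::vector<std::vector<float>> embed(const std::vector<int>& tokenIds,
    const std::vector<std::vector<float>>& embeddingMatrix) // [vocab][hidden]
{
    std::vector<std::vector<float>> output;
    output.reserve(tokenIds.size());
    for (int id : tokenIds)
        output.push_back(embeddingMatrix[id]); // row `id` is the embedding of that token
    return output;
}
```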
+ */
+
+#ifndef SAMPLE_NMT_SLP_EMBEDDER_
+#define SAMPLE_NMT_SLP_EMBEDDER_
+
+#include "embedder.h"
+#include "trtUtil.h"
+
+#include "componentWeights.h"
+
+#include "NvInfer.h"
+
+extern int gPadMultiple;
+
+namespace nmtSample
+{
+/** \class SLPEmbedder
+ *
+ * \brief selects the embedding vector from the weight matrix using index provided in the input
+ *
+ */
+class SLPEmbedder : public Embedder
+{
+public:
+    SLPEmbedder(ComponentWeights::ptr weights);
+
+    void addToModel(
+        nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, nvinfer1::ITensor** output) override;
+
+    int getInputDimensionSize() override;
+
+    std::string getInfo() override;
+
+    ~SLPEmbedder() override = default;
+
+protected:
+    ComponentWeights::ptr mWeights;
+    nvinfer1::Weights mKernelWeights;
+    int mNumInputs;
+    int mNumOutputs;
+    std::vector<float> mResizedKernelWeights;
+};
+} // namespace nmtSample
+
+#endif // SAMPLE_NMT_SLP_EMBEDDER_
diff --git a/samples/opensource/sampleNMT/model/slpProjection.cpp b/samples/opensource/sampleNMT/model/slpProjection.cpp
new file mode 100644
index 00000000..25ed1e71
--- /dev/null
+++ b/samples/opensource/sampleNMT/model/slpProjection.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slpProjection.h"
+#include "common.h"
+
+#include <cassert>
+#include <sstream>
+
+namespace nmtSample
+{
+SLPProjection::SLPProjection(ComponentWeights::ptr weights)
+    : mWeights(weights)
+{
+    // please refer to chpt_to_bin.py for the details on the format
+    assert(mWeights->mMetaData.size() >= 3);
+    mKernelWeights.type = static_cast<nvinfer1::DataType>(mWeights->mMetaData[0]);
+    assert(mKernelWeights.type == nvinfer1::DataType::kFLOAT);
+    // Resize dimensions to be multiples of gPadMultiple for performance
+    mInputChannelCount = samplesCommon::roundUp(mWeights->mMetaData[1], gPadMultiple);  // matches embedder outputs
+    mOutputChannelCount = samplesCommon::roundUp(mWeights->mMetaData[2], gPadMultiple); // matches embedder inputs
+    mResizedKernelWeights = resizeWeights(mWeights->mMetaData[1], mWeights->mMetaData[2], mInputChannelCount,
+        mOutputChannelCount, (const float*) &mWeights->mWeights[0]);
+    mKernelWeights.values = mResizedKernelWeights.data();
+    mKernelWeights.count = mInputChannelCount * mOutputChannelCount;
+}
+
+void SLPProjection::addToModel(
+    nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, nvinfer1::ITensor** outputLogits)
+{
+    nvinfer1::Dims weightDims{2, {mInputChannelCount, mOutputChannelCount},
+        {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kCHANNEL}};
+    auto constLayer = network->addConstant(weightDims, mKernelWeights);
+    assert(constLayer != nullptr);
+    constLayer->setName("Projection matrix");
+    auto weights = constLayer->getOutput(0);
+    assert(weights != nullptr);
+
+    auto mmLayer = network->addMatrixMultiply(*input, false, *weights, false);
+    assert(mmLayer != nullptr);
+    mmLayer->setName("Projection Matrix Multiply");
+    *outputLogits = mmLayer->getOutput(0);
+    assert(*outputLogits != nullptr);
+}
+
+int SLPProjection::getOutputSize()
+{
+    return mOutputChannelCount;
+}
+
+std::string SLPProjection::getInfo()
+{
+    std::stringstream ss;
+    ss << "SLP Projection, num inputs = " << mInputChannelCount << ", num outputs = " << mOutputChannelCount;
+    return ss.str();
+}
+} // namespace nmtSample
diff --git a/samples/opensource/sampleNMT/model/slpProjection.h b/samples/opensource/sampleNMT/model/slpProjection.h
new file mode 100644
index 00000000..27c7586b
--- /dev/null
+++ b/samples/opensource/sampleNMT/model/slpProjection.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SAMPLE_NMT_SLP_PROJECTION_
+#define SAMPLE_NMT_SLP_PROJECTION_
+
+#include "projection.h"
+#include "trtUtil.h"
+
+#include "componentWeights.h"
+
+extern int gPadMultiple;
+
+namespace nmtSample
+{
+/** \class SLPProjection
+ *
+ * \brief Linear logits calculation
+ *
+ * Calculates logits vector by multiplying input vector with weight matrix
+ *
+ */
+class SLPProjection : public Projection
+{
+public:
+    SLPProjection(ComponentWeights::ptr weights);
+
+    void addToModel(
+        nvinfer1::INetworkDefinition* network, nvinfer1::ITensor* input, nvinfer1::ITensor** outputLogits) override;
+
+    int getOutputSize() override;
+
+    std::string getInfo() override;
+
+    ~SLPProjection() override = default;
+
+protected:
+    ComponentWeights::ptr mWeights;
+    nvinfer1::Weights mKernelWeights;
+    int mInputChannelCount;
+    int mOutputChannelCount;
+    std::vector<float> mResizedKernelWeights;
+};
+} // namespace nmtSample
+
+#endif // SAMPLE_NMT_SLP_PROJECTION_
diff --git a/samples/opensource/sampleNMT/model/softmaxLikelihood.cpp b/samples/opensource/sampleNMT/model/softmaxLikelihood.cpp
new file mode 100644
index 00000000..6d16fe11
--- /dev/null
+++ b/samples/opensource/sampleNMT/model/softmaxLikelihood.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "softmaxLikelihood.h"
+
+#include <cassert>
+
+#include <memory>
+
+namespace nmtSample
+{
+void SoftmaxLikelihood::addToModel(nvinfer1::INetworkDefinition* network, int beamWidth, nvinfer1::ITensor* inputLogits,
+    nvinfer1::ITensor* inputLikelihoods, nvinfer1::ITensor** newCombinedLikelihoods,
+    nvinfer1::ITensor** newRayOptionIndices, nvinfer1::ITensor** newVocabularyIndices)
+{
+    auto softmaxLayer = network->addSoftMax(*inputLogits);
+    assert(softmaxLayer != nullptr);
+    softmaxLayer->setName("Softmax in likelihood calculation");
+    softmaxLayer->setAxes(2);
+    auto softmaxTensor = softmaxLayer->getOutput(0);
+    assert(softmaxTensor != nullptr);
+
+    auto topKLayer = network->addTopK(*softmaxTensor, nvinfer1::TopKOperation::kMAX, beamWidth, 2);
+    assert(topKLayer != nullptr);
+    topKLayer->setName("TopK 1st in likelihood calculation");
+    auto newLikelihoods = topKLayer->getOutput(0);
+    assert(newLikelihoods != nullptr);
+    auto vocabularyIndices = topKLayer->getOutput(1);
+    assert(vocabularyIndices != nullptr);
+
+    auto eltWiseLayer
+        = network->addElementWise(*newLikelihoods, *inputLikelihoods, nvinfer1::ElementWiseOperation::kPROD);
+    assert(eltWiseLayer != nullptr);
+    eltWiseLayer->setName("EltWise multiplication in likelihood calculation");
+    auto combinedLikelihoods = eltWiseLayer->getOutput(0);
+    assert(combinedLikelihoods != nullptr);
+
+    auto shuffleLayer = network->addShuffle(*combinedLikelihoods);
+    assert(shuffleLayer != nullptr);
+    shuffleLayer->setName("Reshape combined likelihoods");
+    nvinfer1::Dims shuffleDims{1, {beamWidth * beamWidth}, {nvinfer1::DimensionType::kCHANNEL}};
+    shuffleLayer->setReshapeDimensions(shuffleDims);
+    auto reshapedCombinedLikelihoods = shuffleLayer->getOutput(0);
+    assert(reshapedCombinedLikelihoods != nullptr);
+
+    auto topKLayer2 = network->addTopK(*reshapedCombinedLikelihoods, nvinfer1::TopKOperation::kMAX, beamWidth, 1);
+    assert(topKLayer2 != nullptr);
+    topKLayer2->setName("TopK 2nd in likelihood calculation");
+    *newCombinedLikelihoods = topKLayer2->getOutput(0);
+    assert(*newCombinedLikelihoods != nullptr);
+    *newRayOptionIndices = topKLayer2->getOutput(1);
+    assert(*newRayOptionIndices != nullptr);
+
+    auto shuffleLayer2 = network->addShuffle(*vocabularyIndices);
+    assert(shuffleLayer2 != nullptr);
+    shuffleLayer2->setName("Reshape vocabulary indices");
+    nvinfer1::Dims shuffleDims2{1, {beamWidth * beamWidth}, {nvinfer1::DimensionType::kCHANNEL}};
+    shuffleLayer2->setReshapeDimensions(shuffleDims2);
+    auto reshapedVocabularyIndices = shuffleLayer2->getOutput(0);
+    assert(reshapedVocabularyIndices != nullptr);
+
+    auto gatherLayer = network->addGather(*reshapedVocabularyIndices, **newRayOptionIndices, 0);
+    assert(gatherLayer != nullptr);
+    gatherLayer->setName("Shuffle vocabulary indices");
+    *newVocabularyIndices = gatherLayer->getOutput(0);
+    assert(*newVocabularyIndices != nullptr);
+}
+
+float SoftmaxLikelihood::SoftmaxLikelihoodCombinationOperator::combine(
+    float rayLikelihood, float optionLikelihood) const
+{
+    return rayLikelihood * optionLikelihood;
+}
+
+float SoftmaxLikelihood::SoftmaxLikelihoodCombinationOperator::init() const
+{
+    return 1.0F;
+}
+
+float SoftmaxLikelihood::SoftmaxLikelihoodCombinationOperator::smallerThanMinimalLikelihood() const
+{
+    return -1.0F;
+}
+
+LikelihoodCombinationOperator::ptr SoftmaxLikelihood::getLikelihoodCombinationOperator() const
+{
+    return std::make_shared<SoftmaxLikelihoodCombinationOperator>();
+}
+
+std::string SoftmaxLikelihood::getInfo()
+{
+    return "Softmax Likelihood";
+}
+} // namespace nmtSample
diff --git
a/samples/opensource/sampleNMT/model/softmaxLikelihood.h b/samples/opensource/sampleNMT/model/softmaxLikelihood.h new file mode 100644 index 00000000..75dd73dd --- /dev/null +++ b/samples/opensource/sampleNMT/model/softmaxLikelihood.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SAMPLE_NMT_SOFTMAX_LIKELIHOOD_ +#define SAMPLE_NMT_SOFTMAX_LIKELIHOOD_ + +#include "NvInfer.h" +#include "likelihood.h" + +namespace nmtSample +{ +/** \class SoftmaxLikelihood + * + * \brief calculates softmax likelihood and TopK indices for the raw input logits + * + */ +class SoftmaxLikelihood : public Likelihood +{ +private: + class SoftmaxLikelihoodCombinationOperator : public LikelihoodCombinationOperator + { + public: + SoftmaxLikelihoodCombinationOperator() = default; + + float combine(float rayLikelihood, float optionLikelihood) const override; + + float init() const override; + + float smallerThanMinimalLikelihood() const override; + + ~SoftmaxLikelihoodCombinationOperator() override = default; + }; + +public: + SoftmaxLikelihood() = default; + + LikelihoodCombinationOperator::ptr getLikelihoodCombinationOperator() const override; + + void addToModel(nvinfer1::INetworkDefinition* network, int beamWidth, nvinfer1::ITensor* inputLogits, + nvinfer1::ITensor* inputLikelihoods, nvinfer1::ITensor** newCombinedLikelihoods, + nvinfer1::ITensor** newRayOptionIndices, nvinfer1::ITensor** newVocabularyIndices) override; + + std::string getInfo() override; + + ~SoftmaxLikelihood() override = default; +}; +} // namespace nmtSample + +#endif // SAMPLE_NMT_SOFTMAX_LIKELIHOOD_ diff --git a/samples/opensource/sampleNMT/pinnedHostBuffer.h b/samples/opensource/sampleNMT/pinnedHostBuffer.h new file mode 100644 index 00000000..8f4f1045 --- /dev/null +++ b/samples/opensource/sampleNMT/pinnedHostBuffer.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef SAMPLE_NMT_PINNED_HOST_BUFFER_
+#define SAMPLE_NMT_PINNED_HOST_BUFFER_
+
+#include "cudaError.h"
+#include <cuda_runtime_api.h>
+#include <memory>
+
+namespace nmtSample
+{
+/** \class PinnedHostBuffer
+ *
+ * \brief wrapper for the pinned host memory region
+ *
+ */
+template <typename T>
+class PinnedHostBuffer
+{
+public:
+    typedef std::shared_ptr<PinnedHostBuffer<T>> ptr;
+
+    PinnedHostBuffer(size_t elementCount)
+        : mBuffer(nullptr)
+    {
+        CUDA_CHECK(cudaHostAlloc(&mBuffer, elementCount * sizeof(T), cudaHostAllocDefault));
+    }
+
+    virtual ~PinnedHostBuffer()
+    {
+        if (mBuffer)
+        {
+            cudaFreeHost(mBuffer);
+        }
+    }
+
+    operator T*()
+    {
+        return mBuffer;
+    }
+
+    operator const T*() const
+    {
+        return mBuffer;
+    }
+
+protected:
+    T* mBuffer;
+};
+} // namespace nmtSample
+
+#endif // SAMPLE_NMT_PINNED_HOST_BUFFER_
diff --git a/samples/opensource/sampleNMT/sampleNMT.cpp b/samples/opensource/sampleNMT/sampleNMT.cpp
new file mode 100644
index 00000000..d7de1c76
--- /dev/null
+++ b/samples/opensource/sampleNMT/sampleNMT.cpp
@@ -0,0 +1,1302 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <future>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "NvInfer.h"
+#include "argsParser.h"
+#include "common.h"
+#include "data/benchmarkWriter.h"
+#include "data/bleuScoreWriter.h"
+#include "data/dataReader.h"
+#include "data/dataWriter.h"
+#include "data/limitedSamplesDataReader.h"
+#include "data/sequenceProperties.h"
+#include "data/textReader.h"
+#include "data/textWriter.h"
+#include "data/vocabulary.h"
+#include "deviceBuffer.h"
+#include "logger.h"
+#include "model/alignment.h"
+#include "model/attention.h"
+#include "model/beamSearchPolicy.h"
+#include "model/componentWeights.h"
+#include "model/contextNMT.h"
+#include "model/debugUtil.h"
+#include "model/decoder.h"
+#include "model/embedder.h"
+#include "model/encoder.h"
+#include "model/likelihood.h"
+#include "model/lstmDecoder.h"
+#include "model/lstmEncoder.h"
+#include "model/multiplicativeAlignment.h"
+#include "model/projection.h"
+#include "model/slpAttention.h"
+#include "model/slpEmbedder.h"
+#include "model/slpProjection.h"
+#include "model/softmaxLikelihood.h"
+#include "pinnedHostBuffer.h"
+#include "trtUtil.h"
+
+bool gPrintComponentInfo = true;
+bool gFeedAttentionToInput = true;
+bool gInitializeDecoderFromEncoderHiddenStates = true;
+
+int gMaxBatchSize = 128;
+int gBeamWidth = 5;
+int gMaxInputSequenceLength = 150;
+int gMaxOutputSequenceLength = -1;
+int gMaxInferenceSamples = -1;
+std::string gDataWriterStr = "bleu";
+std::string gOutputTextFileName("translation_output.txt");
+int gMaxWorkspaceSize = 256_MiB;
+std::string gDataDirectory("data/samples/nmt/deen");
+bool gEnableProfiling = false;
+bool gAggregateProfiling = false;
+bool gFp16 = false;
+bool gVerbose = false;
+bool gInt8 = false;
+int gUseDLACore{-1};
+int gPadMultiple = 1;
+
+const std::string gSampleName = "TensorRT.sample_nmt";
"TensorRT.sample_nmt"; + +std::string gInputTextFileName("newstest2015.tok.bpe.32000.de"); +std::string gReferenceOutputTextFileName("newstest2015.tok.bpe.32000.en"); +std::string gInputVocabularyFileName("vocab.bpe.32000.de"); +std::string gOutputVocabularyFileName("vocab.bpe.32000.en"); +std::string gEncEmbedFileName("weights/encembed.bin"); +std::string gEncRnnFileName("weights/encrnn.bin"); +std::string gDecEmbedFileName("weights/decembed.bin"); +std::string gDecRnnFileName("weights/decrnn.bin"); +std::string gDecAttFileName("weights/decatt.bin"); +std::string gDecMemFileName("weights/decmem.bin"); +std::string gDecProjFileName("weights/decproj.bin"); +nmtSample::Vocabulary::ptr gOutputVocabulary = std::make_shared(); + +std::string locateNMTFile(const std::string& fpathSuffix) +{ + std::vector dirs{std::string(gDataDirectory) + "/", "data/nmt/deen/"}; + return locateFile(fpathSuffix, dirs); +} + +nmtSample::SequenceProperties::ptr getOutputSequenceProperties() +{ + return gOutputVocabulary; +} + +nmtSample::DataReader::ptr getDataReader() +{ + std::shared_ptr textInput(new std::ifstream(locateNMTFile(gInputTextFileName))); + std::shared_ptr vocabInput(new std::ifstream(locateNMTFile(gInputVocabularyFileName))); + assert(textInput->good()); + assert(vocabInput->good()); + + auto vocabulary = std::make_shared(); + *vocabInput >> *vocabulary; + + auto reader = std::make_shared(textInput, vocabulary); + + if (gMaxInferenceSamples >= 0) + return std::make_shared(gMaxInferenceSamples, reader); + else + return reader; +} + +template +std::shared_ptr buildNMTComponentFromWeightsFile(const std::string& filename) +{ + auto weights = std::make_shared(); + std::ifstream input(locateNMTFile(filename), std::ios::binary); + assert(input.good()); + input >> *weights; + + return std::make_shared(weights); +} + +nmtSample::Embedder::ptr getInputEmbedder() +{ + return buildNMTComponentFromWeightsFile(gEncEmbedFileName); +} + +nmtSample::Embedder::ptr getOutputEmbedder() +{ + return buildNMTComponentFromWeightsFile(gDecEmbedFileName); +} + +nmtSample::Encoder::ptr getEncoder() +{ + return buildNMTComponentFromWeightsFile(gEncRnnFileName); +} + +nmtSample::Alignment::ptr getAlignment() +{ + return buildNMTComponentFromWeightsFile(gDecMemFileName); +} + +nmtSample::Context::ptr getContext() +{ + return std::make_shared(); +} + +nmtSample::Decoder::ptr getDecoder() +{ + return buildNMTComponentFromWeightsFile(gDecRnnFileName); +} + +nmtSample::Attention::ptr getAttention() +{ + return buildNMTComponentFromWeightsFile(gDecAttFileName); +} + +nmtSample::Projection::ptr getProjection() +{ + return buildNMTComponentFromWeightsFile(gDecProjFileName); +} + +nmtSample::Likelihood::ptr getLikelihood() +{ + return std::make_shared(); +} + +nmtSample::BeamSearchPolicy::ptr getSearchPolicy( + int endSequenceId, nmtSample::LikelihoodCombinationOperator::ptr likelihoodCombinationOperator) +{ + return std::make_shared(endSequenceId, likelihoodCombinationOperator, gBeamWidth); +} + +nmtSample::DataWriter::ptr getDataWriter() +{ + if (gDataWriterStr == "bleu") + { + std::shared_ptr textInput(new std::ifstream(locateNMTFile(gReferenceOutputTextFileName))); + assert(textInput->good()); + return std::make_shared(textInput, gOutputVocabulary); + } + else if (gDataWriterStr == "text") + { + std::remove(gOutputTextFileName.data()); + std::shared_ptr textOutput(new std::ofstream(gOutputTextFileName)); + assert(textOutput->good() + && "Please contact system administrator if you have no permission to write the file " + 
"translation_output.txt"); + return std::make_shared(textOutput, gOutputVocabulary); + } + else if (gDataWriterStr == "benchmark") + { + return std::make_shared(); + } + else + { + gLogError << "Invalid data writer specified: " << gDataWriterStr << std::endl; + assert(0); + return nmtSample::DataWriter::ptr(); + } +} + +bool parseString(const char* arg, const char* name, std::string& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; + if (match) + { + value = arg + n + 3; + gLogInfo << name << ": " << value << std::endl; + } + return match; +} + +bool parseInt(const char* arg, const char* name, int& value) +{ + size_t n = strlen(name); + bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; + if (match) + { + value = atoi(arg + n + 3); + gLogInfo << name << ": " << value << std::endl; + } + return match; +} + +bool parseBool(const char* arg, const char* longName, bool& value, char shortName = 0) +{ + bool match = false; + + if (shortName) + { + match = (arg[0] == '-') && (arg[1] == shortName); + } + if (!match && longName) + { + const size_t n = strlen(longName); + match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, longName, n); + } + if (match) + { + gLogInfo << longName << ": true" << std::endl; + value = true; + } + return match; +} + +void printUsage() +{ + printf("\nOptional params:\n"); + printf(" --help, -h Output help message and exit\n"); + printf(" --data_writer=bleu/text/benchmark Type of the output the app generates (default = %s)\n", + gDataWriterStr.c_str()); + printf(" --output_file= Path to the output file when data_writer=text (default = %s)\n", + gOutputTextFileName.c_str()); + printf(" --batch= Batch size (default = %d)\n", gMaxBatchSize); + printf(" --beam= Beam width (default = %d)\n", gBeamWidth); + printf(" --max_input_sequence_length= Maximum length for input sequences (default = %d)\n", + gMaxInputSequenceLength); + printf( + " --max_output_sequence_length= Maximum length for output sequences (default = %d), negative value " + "indicates no limit\n", + gMaxOutputSequenceLength); + printf( + " --max_inference_samples= Maximum sample count to run inference for, negative values indicates " + "no limit is set (default = %d)\n", + gMaxInferenceSamples); + printf(" --verbose Output verbose-level messages by TensorRT\n"); + printf(" --max_workspace_size= Maximum workspace size (default = %d)\n", gMaxWorkspaceSize); + printf( + " --data_dir= Path to the directory where data and weights are located (default = " + "%s)\n", + gDataDirectory.c_str()); + printf( + " --profile Profile TensorRT execution layer by layer. Use benchmark data_writer " + "when profiling on, disregard benchmark results\n"); + printf(" --aggregate_profile Merge profiles from multiple TensorRT engines\n"); + printf(" --fp16 Switch on fp16 math\n"); + printf(" --int8 Switch on int8 math\n"); + printf( + " --useDLACore=N Specify a DLA engine for layers that support DLA. 
Value can range from " + "0 to n-1, where n is the number of DLA engines on the platform.\n"); + printf( + " --padMultiple=N Specify multiple to pad out matrix dimensions to test performance\n"); +} + +bool parseNMTArgs(samplesCommon::Args& args, int argc, char* argv[]) +{ + if (argc < 1) + { + printUsage(); + return false; + } + + bool showHelp = false; + for (int j = 1; j < argc; j++) + { + if (parseBool(argv[j], "help", showHelp, 'h')) + continue; + if (parseString(argv[j], "data_writer", gDataWriterStr)) + continue; + if (parseString(argv[j], "output_file", gOutputTextFileName)) + continue; + if (parseInt(argv[j], "batch", gMaxBatchSize)) + continue; + if (parseInt(argv[j], "beam", gBeamWidth)) + continue; + if (parseInt(argv[j], "max_input_sequence_length", gMaxInputSequenceLength)) + continue; + if (parseInt(argv[j], "max_output_sequence_length", gMaxOutputSequenceLength)) + continue; + if (parseInt(argv[j], "max_inference_samples", gMaxInferenceSamples)) + continue; + if (parseBool(argv[j], "verbose", gVerbose)) + continue; + if (parseInt(argv[j], "max_workspace_size", gMaxWorkspaceSize)) + continue; + if (parseString(argv[j], "data_dir", gDataDirectory)) + continue; + if (parseBool(argv[j], "profile", gEnableProfiling)) + continue; + if (parseBool(argv[j], "aggregate_profile", gAggregateProfiling)) + continue; + if (parseBool(argv[j], "fp16", gFp16)) + continue; + if (parseBool(argv[j], "int8", gInt8)) + continue; + if (parseInt(argv[j], "useDLACore", gUseDLACore)) + continue; + if (parseInt(argv[j], "padMultiple", gPadMultiple)) + continue; + } + + if (showHelp) + { + printUsage(); + args.help = true; + return false; + } + + return true; +} + +nvinfer1::ICudaEngine* getEncoderEngine( + nmtSample::Embedder::ptr inputEmbedder, nmtSample::Encoder::ptr encoder, nmtSample::Alignment::ptr alignment) +{ + nvinfer1::IBuilder* encoderBuilder = nvinfer1::createInferBuilder(gLogger.getTRTLogger()); + assert(encoderBuilder != nullptr); + nvinfer1::IBuilderConfig* encoderConfig = encoderBuilder->createBuilderConfig(); + encoderBuilder->setMaxBatchSize(gMaxBatchSize); + encoderConfig->setMaxWorkspaceSize(gMaxWorkspaceSize); + if (gFp16) + { + encoderConfig->setFlag(BuilderFlag::kFP16); + } + if (gInt8) + { + encoderConfig->setFlag(BuilderFlag::kINT8); + } + + nvinfer1::INetworkDefinition* encoderNetwork = encoderBuilder->createNetwork(); + + // Define inputs for the encoder + nvinfer1::Dims inputDims{1, {gMaxInputSequenceLength}, {nvinfer1::DimensionType::kSEQUENCE}}; + auto inputEncoderDataTensor = encoderNetwork->addInput("input_encoder_data", nvinfer1::DataType::kINT32, inputDims); + assert(inputEncoderDataTensor != nullptr); + nvinfer1::Dims inputSequenceLengthsDims{0, {}, {}}; + auto actualInputSequenceLengthsTensor = encoderNetwork->addInput( + "actual_input_sequence_lengths", nvinfer1::DataType::kINT32, inputSequenceLengthsDims); + assert(actualInputSequenceLengthsTensor != nullptr); + nvinfer1::Dims inputSequenceLengthsWithUnitIndexDims{1, {1}, {nvinfer1::DimensionType::kINDEX}}; + auto actualInputSequenceLengthsWithUnitIndexTensor + = encoderNetwork->addInput("actual_input_sequence_lengths_with_index_dim", nvinfer1::DataType::kINT32, + inputSequenceLengthsWithUnitIndexDims); + assert(actualInputSequenceLengthsWithUnitIndexTensor != nullptr); + + auto stateSizes = encoder->getStateSizes(); + std::vector encoderInputStatesTensors(stateSizes.size()); + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_encoder_states_" << i; + 
encoderInputStatesTensors[i] = encoderNetwork->addInput( + ss.str().c_str(), gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, stateSizes[i]); + assert(encoderInputStatesTensors[i] != nullptr); + } + + nvinfer1::ITensor* initializeDecoderIndicesTensor = nullptr; + if (gInitializeDecoderFromEncoderHiddenStates) + { + nvinfer1::Dims inputDims{1, {gBeamWidth}, {nvinfer1::DimensionType::kINDEX}}; + initializeDecoderIndicesTensor + = encoderNetwork->addInput("initialize_decoder_indices", nvinfer1::DataType::kINT32, inputDims); + assert(initializeDecoderIndicesTensor != nullptr); + } + + nvinfer1::ITensor* inputEncoderEmbeddedTensor; + inputEmbedder->addToModel(encoderNetwork, inputEncoderDataTensor, &inputEncoderEmbeddedTensor); + inputEncoderEmbeddedTensor->setName("input_data_embedded"); + + nvinfer1::ITensor* memoryStatesTensor; + std::vector encoderOutputStatesTensors(stateSizes.size()); + encoder->addToModel(encoderNetwork, gMaxInputSequenceLength, inputEncoderEmbeddedTensor, + actualInputSequenceLengthsTensor, &encoderInputStatesTensors[0], &memoryStatesTensor, + gInitializeDecoderFromEncoderHiddenStates ? &encoderOutputStatesTensors[0] : nullptr); + memoryStatesTensor->setName("memory_states"); + encoderNetwork->markOutput(*memoryStatesTensor); + memoryStatesTensor->setType(gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + + if (alignment->getAttentionKeySize() > 0) + { + nvinfer1::ITensor* attentionKeysTensor; + alignment->addAttentionKeys(encoderNetwork, memoryStatesTensor, &attentionKeysTensor); + attentionKeysTensor->setName("attention_keys"); + encoderNetwork->markOutput(*attentionKeysTensor); + attentionKeysTensor->setType(gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + } + + // Replicate sequence lengths for the decoder + { + auto gatherLayer = encoderNetwork->addGather( + *actualInputSequenceLengthsWithUnitIndexTensor, *initializeDecoderIndicesTensor, 0); + assert(gatherLayer != nullptr); + gatherLayer->setName("Replicate input sequence lengths for decoder"); + auto actualInputSequenceLengthsReplicatedTensor = gatherLayer->getOutput(0); + assert(actualInputSequenceLengthsReplicatedTensor != nullptr); + actualInputSequenceLengthsReplicatedTensor->setName("actual_input_sequence_lengths_replicated"); + encoderNetwork->markOutput(*actualInputSequenceLengthsReplicatedTensor); + actualInputSequenceLengthsReplicatedTensor->setType(nvinfer1::DataType::kINT32); + } + + if (gInitializeDecoderFromEncoderHiddenStates) + { + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + assert(encoderOutputStatesTensors[i] != nullptr); + + // Insert index (Z=1) dimension into tensor + nvinfer1::ITensor* encoderOutputStatesTensorWithUnitIndex; + { + auto shuffleLayer = encoderNetwork->addShuffle(*encoderOutputStatesTensors[i]); + assert(shuffleLayer != nullptr); + { + std::stringstream ss; + ss << "Reshape encoder states for decoder initialization " << i; + shuffleLayer->setName(ss.str().c_str()); + } + nvinfer1::Dims shuffleDims; + { + shuffleDims.nbDims = stateSizes[i].nbDims + 1; + shuffleDims.d[0] = 1; + shuffleDims.type[0] = nvinfer1::DimensionType::kINDEX; + for (int j = 0; j < stateSizes[i].nbDims; ++j) + { + shuffleDims.d[j + 1] = stateSizes[i].d[j]; + shuffleDims.type[j + 1] = stateSizes[i].type[j]; + } + } + shuffleLayer->setReshapeDimensions(shuffleDims); + encoderOutputStatesTensorWithUnitIndex = shuffleLayer->getOutput(0); + assert(encoderOutputStatesTensorWithUnitIndex != nullptr); + } + auto gatherLayer = 
encoderNetwork->addGather( + *encoderOutputStatesTensorWithUnitIndex, *initializeDecoderIndicesTensor, 0); + assert(gatherLayer != nullptr); + { + std::stringstream ss; + ss << "Replicate encoder states for decoder initialization " << i; + gatherLayer->setName(ss.str().c_str()); + } + auto inputDecoderHiddenStatesTensor = gatherLayer->getOutput(0); + assert(inputDecoderHiddenStatesTensor != nullptr); + std::stringstream ss; + ss << "input_decoder_states_" << i; + inputDecoderHiddenStatesTensor->setName(ss.str().c_str()); + encoderNetwork->markOutput(*inputDecoderHiddenStatesTensor); + inputDecoderHiddenStatesTensor->setType(gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + } + } + + samplesCommon::setDummyInt8Scales(encoderConfig, encoderNetwork); + samplesCommon::enableDLA(encoderBuilder, encoderConfig, gUseDLACore); + auto res = encoderBuilder->buildEngineWithConfig(*encoderNetwork, *encoderConfig); + encoderNetwork->destroy(); + encoderBuilder->destroy(); + encoderConfig->destroy(); + return res; +} + +nvinfer1::ICudaEngine* getGeneratorEngine(nmtSample::Embedder::ptr outputEmbedder, nmtSample::Decoder::ptr decoder, + nmtSample::Alignment::ptr alignment, nmtSample::Context::ptr context, nmtSample::Attention::ptr attention, + nmtSample::Projection::ptr projection, nmtSample::Likelihood::ptr likelihood) +{ + nvinfer1::IBuilder* generatorBuilder = nvinfer1::createInferBuilder(gLogger.getTRTLogger()); + assert(generatorBuilder != nullptr); + nvinfer1::IBuilderConfig* generatorConfig = generatorBuilder->createBuilderConfig(); + generatorBuilder->setMaxBatchSize(gMaxBatchSize); + generatorConfig->setMaxWorkspaceSize(gMaxWorkspaceSize); + if (gFp16) + { + generatorConfig->setFlag(BuilderFlag::kFP16); + } + if (gInt8) + { + generatorConfig->setFlag(BuilderFlag::kINT8); + } + + nvinfer1::INetworkDefinition* generatorNetwork = generatorBuilder->createNetwork(); + + // Define inputs for the generator + auto stateSizes = decoder->getStateSizes(); + std::vector decoderInputStatesTensors(stateSizes.size()); + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_decoder_states_" << i; + nvinfer1::Dims statesDims; + { + statesDims.nbDims = stateSizes[i].nbDims + 1; + statesDims.d[0] = gBeamWidth; + statesDims.type[0] = nvinfer1::DimensionType::kINDEX; + for (int j = 0; j < stateSizes[i].nbDims; ++j) + { + statesDims.d[j + 1] = stateSizes[i].d[j]; + statesDims.type[j + 1] = stateSizes[i].type[j]; + } + } + decoderInputStatesTensors[i] = generatorNetwork->addInput( + ss.str().c_str(), gFp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, statesDims); + assert(decoderInputStatesTensors[i] != nullptr); + } + nvinfer1::Dims inputDecoderDataDims{1, {gBeamWidth}, {nvinfer1::DimensionType::kINDEX}}; + auto inputDecoderDataTensor + = generatorNetwork->addInput("input_decoder_data", nvinfer1::DataType::kINT32, inputDecoderDataDims); + assert(inputDecoderDataTensor != nullptr); + nvinfer1::Dims inputSequenceLengthsTeplicatedDims{ + 2, {gBeamWidth, 1}, {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kCHANNEL}}; + auto actualInputSequenceLengthsReplicatedTensor = generatorNetwork->addInput( + "actual_input_sequence_lengths_replicated", nvinfer1::DataType::kINT32, inputSequenceLengthsTeplicatedDims); + assert(actualInputSequenceLengthsReplicatedTensor != nullptr); + nvinfer1::Dims memoryStatesDims{2, {gMaxInputSequenceLength, alignment->getSourceStatesSize()}, + {nvinfer1::DimensionType::kSEQUENCE, nvinfer1::DimensionType::kCHANNEL}}; + auto memoryStatesTensor = generatorNetwork->addInput( + "memory_states", gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, memoryStatesDims); + assert(memoryStatesTensor != nullptr); + nvinfer1::ITensor* attentionKeysTensor = nullptr; + if (alignment->getAttentionKeySize() > 0) + { + nvinfer1::Dims attentionKeysDims{2, {gMaxInputSequenceLength, alignment->getAttentionKeySize()}, + {nvinfer1::DimensionType::kSEQUENCE, nvinfer1::DimensionType::kCHANNEL}}; + attentionKeysTensor = generatorNetwork->addInput( + "attention_keys", gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, attentionKeysDims); + assert(attentionKeysTensor != nullptr); + } + nvinfer1::ITensor* inputAttentionTensor = nullptr; + if (gFeedAttentionToInput) + { + nvinfer1::Dims inputAttentionDims{2, {gBeamWidth, attention->getAttentionSize()}, + {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kCHANNEL}}; + inputAttentionTensor = generatorNetwork->addInput( + "input_attention", gFp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, inputAttentionDims); + assert(inputAttentionTensor != nullptr); + } + nvinfer1::Dims inputLikelihoodsDims{ + 2, {gBeamWidth, 1}, {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kCHANNEL}}; + auto inputLikelihoodsTensor + = generatorNetwork->addInput("input_likelihoods", nvinfer1::DataType::kFLOAT, inputLikelihoodsDims); + assert(inputLikelihoodsTensor != nullptr); + nvinfer1::Dims inputLikelihoodsReplicateIndicesDims{1, {gBeamWidth}, {nvinfer1::DimensionType::kCHANNEL}}; + auto inputLikelihoodsReplicateIndicesTensor = generatorNetwork->addInput( + "replicate_likelihoods_indices", nvinfer1::DataType::kINT32, inputLikelihoodsReplicateIndicesDims); + assert(inputLikelihoodsReplicateIndicesTensor != nullptr); + + // Add output embedder + nvinfer1::ITensor* inputDecoderEmbeddedTensor; + outputEmbedder->addToModel(generatorNetwork, inputDecoderDataTensor, &inputDecoderEmbeddedTensor); + assert(inputDecoderEmbeddedTensor != nullptr); + + // Add concatination of previous attention vector and embedded input for the decoder + nvinfer1::ITensor* inputDecoderEmbeddedConcatinatedWithAttentionTensor{nullptr}; + if (gFeedAttentionToInput) + { + nvinfer1::ITensor* inputTensors[] = {inputDecoderEmbeddedTensor, inputAttentionTensor}; + auto concatLayer = generatorNetwork->addConcatenation(inputTensors, 2); + assert(concatLayer != nullptr); + concatLayer->setName("Concatenate embedded input and attention"); + concatLayer->setAxis(1); + inputDecoderEmbeddedConcatinatedWithAttentionTensor = concatLayer->getOutput(0); + assert(inputDecoderEmbeddedConcatinatedWithAttentionTensor != nullptr); + } + + // Add decoder (single timestep) + nvinfer1::ITensor* outputDecoderDataTensor{nullptr}; + std::vector decoderOutputStatesTensors(stateSizes.size()); + decoder->addToModel(generatorNetwork, + gFeedAttentionToInput ? inputDecoderEmbeddedConcatinatedWithAttentionTensor : inputDecoderEmbeddedTensor, + &decoderInputStatesTensors[0], &outputDecoderDataTensor, &decoderOutputStatesTensors[0]); + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "output_decoder_states_" << i; + decoderOutputStatesTensors[i]->setName(ss.str().c_str()); + generatorNetwork->markOutput(*decoderOutputStatesTensors[i]); + decoderOutputStatesTensors[i]->setType(gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + } + + // Add alignment scores + nvinfer1::ITensor* alignmentScoresTensor; + alignment->addToModel(generatorNetwork, + (alignment->getAttentionKeySize() > 0) ? attentionKeysTensor : memoryStatesTensor, outputDecoderDataTensor, + &alignmentScoresTensor); + + // Add context + nvinfer1::ITensor* contextTensor; + context->addToModel(generatorNetwork, actualInputSequenceLengthsReplicatedTensor, memoryStatesTensor, + alignmentScoresTensor, &contextTensor); + + // Add attention + nvinfer1::ITensor* attentionTensor; + attention->addToModel(generatorNetwork, outputDecoderDataTensor, contextTensor, &attentionTensor); + if (gFeedAttentionToInput) + { + attentionTensor->setName("output_attention"); + generatorNetwork->markOutput(*attentionTensor); + attentionTensor->setType(gFp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + } + + // Add projection + nvinfer1::ITensor* logitsTensor; + projection->addToModel(generatorNetwork, attentionTensor, &logitsTensor); + + // Replicate input likelihoods across all TopK options + auto gatherLayer = generatorNetwork->addGather(*inputLikelihoodsTensor, *inputLikelihoodsReplicateIndicesTensor, 1); + assert(gatherLayer != nullptr); + gatherLayer->setName("Replicate beam likelihoods"); + auto inputLikelihoodsReplicatedTensor = gatherLayer->getOutput(0); + assert(inputLikelihoodsReplicatedTensor != nullptr); + + // Add per-ray top-k options generation + nvinfer1::ITensor* outputCombinedLikelihoodsTensor; + nvinfer1::ITensor* outputRayOptionIndicesTensor; + nvinfer1::ITensor* outputVocabularyIndicesTensor; + likelihood->addToModel(generatorNetwork, gBeamWidth, logitsTensor, inputLikelihoodsReplicatedTensor, + &outputCombinedLikelihoodsTensor, &outputRayOptionIndicesTensor, &outputVocabularyIndicesTensor); + outputCombinedLikelihoodsTensor->setName("output_combined_likelihoods"); + generatorNetwork->markOutput(*outputCombinedLikelihoodsTensor); + outputRayOptionIndicesTensor->setName("output_ray_option_indices"); + generatorNetwork->markOutput(*outputRayOptionIndicesTensor); + outputRayOptionIndicesTensor->setType(nvinfer1::DataType::kINT32); + outputVocabularyIndicesTensor->setName("output_vocabulary_indices"); + generatorNetwork->markOutput(*outputVocabularyIndicesTensor); + outputVocabularyIndicesTensor->setType(nvinfer1::DataType::kINT32); + + samplesCommon::setDummyInt8Scales(generatorConfig, generatorNetwork); + samplesCommon::enableDLA(generatorBuilder, generatorConfig, gUseDLACore); + auto res = generatorBuilder->buildEngineWithConfig(*generatorNetwork, *generatorConfig); + generatorNetwork->destroy(); + generatorBuilder->destroy(); + generatorConfig->destroy(); + return res; +} + +nvinfer1::ICudaEngine* getGeneratorShuffleEngine( + const std::vector& decoderStateSizes, int attentionSize) +{ + nvinfer1::IBuilder* shuffleBuilder = nvinfer1::createInferBuilder(gLogger.getTRTLogger()); + assert(shuffleBuilder != nullptr); + nvinfer1::IBuilderConfig* shuffleConfig = shuffleBuilder->createBuilderConfig(); + shuffleBuilder->setMaxBatchSize(gMaxBatchSize); + shuffleConfig->setMaxWorkspaceSize(gMaxWorkspaceSize); + if (gFp16) + { + shuffleConfig->setFlag(BuilderFlag::kFP16); + } + if (gInt8) + { + shuffleConfig->setFlag(BuilderFlag::kINT8); + } + + nvinfer1::INetworkDefinition* shuffleNetwork = shuffleBuilder->createNetwork(); + + nvinfer1::Dims sourceRayIndicesDims{1, {gBeamWidth}, {nvinfer1::DimensionType::kINDEX}}; + auto sourceRayIndicesTensor + = shuffleNetwork->addInput("source_ray_indices", nvinfer1::DataType::kINT32, sourceRayIndicesDims); + assert(sourceRayIndicesTensor != nullptr); + + std::vector previousOutputDecoderStatesTensors(decoderStateSizes.size()); + for (int i = 0; i < static_cast(decoderStateSizes.size()); ++i) + { + std::stringstream ss; + ss << "previous_output_decoder_states_" << i; + nvinfer1::Dims statesDims; + { + statesDims.nbDims = decoderStateSizes[i].nbDims + 1; + statesDims.d[0] = gBeamWidth; + statesDims.type[0] = nvinfer1::DimensionType::kINDEX; + for (int j = 0; j < decoderStateSizes[i].nbDims; ++j) + { + statesDims.d[j + 1] = decoderStateSizes[i].d[j]; + statesDims.type[j + 1] = decoderStateSizes[i].type[j]; + } + } + previousOutputDecoderStatesTensors[i] = shuffleNetwork->addInput( + ss.str().c_str(), gFp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, statesDims); + assert(previousOutputDecoderStatesTensors[i] != nullptr); + } + + nvinfer1::ITensor* previousOutputAttentionTensor = nullptr; + if (gFeedAttentionToInput) + { + nvinfer1::Dims previousOutputAttentionDims{ + 2, {gBeamWidth, attentionSize}, {nvinfer1::DimensionType::kINDEX, nvinfer1::DimensionType::kCHANNEL}}; + previousOutputAttentionTensor = shuffleNetwork->addInput("previous_output_attention", + gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, previousOutputAttentionDims); + assert(previousOutputAttentionTensor != nullptr); + } + + for (int i = 0; i < static_cast(decoderStateSizes.size()); ++i) + { + auto gatherLayer + = shuffleNetwork->addGather(*previousOutputDecoderStatesTensors[i], *sourceRayIndicesTensor, 0); + assert(gatherLayer != nullptr); + { + std::stringstream ss; + ss << "Shuffle decoder states " << i; + gatherLayer->setName(ss.str().c_str()); + } + auto inputDecoderHiddenStatesTensor = gatherLayer->getOutput(0); + assert(inputDecoderHiddenStatesTensor != nullptr); + std::stringstream ss; + ss << "input_decoder_states_" << i; + inputDecoderHiddenStatesTensor->setName(ss.str().c_str()); + shuffleNetwork->markOutput(*inputDecoderHiddenStatesTensor); + inputDecoderHiddenStatesTensor->setType(gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + } + + if (gFeedAttentionToInput) + { + auto gatherLayer = shuffleNetwork->addGather(*previousOutputAttentionTensor, *sourceRayIndicesTensor, 0); + assert(gatherLayer != nullptr); + gatherLayer->setName("Shuffle attention"); + auto inputAttentionTensor = gatherLayer->getOutput(0); + assert(inputAttentionTensor != nullptr); + inputAttentionTensor->setName("input_attention"); + shuffleNetwork->markOutput(*inputAttentionTensor); + inputAttentionTensor->setType(gFp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + } + + samplesCommon::setDummyInt8Scales(shuffleConfig, shuffleNetwork); + samplesCommon::enableDLA(shuffleBuilder, shuffleConfig, gUseDLACore); + auto res = shuffleBuilder->buildEngineWithConfig(*shuffleNetwork, *shuffleConfig); + shuffleNetwork->destroy(); + shuffleBuilder->destroy(); + shuffleConfig->destroy(); + return res; +} + +//! \brief assign device pointers to the correct location in the bindings vector. +//! +//! Given a binding map which stores the name to device pointer mapping, in +//! a generic fashion, insert into the bindings vector the device pointer +//! at the correct index for a given engine. 
+void processBindings( + std::vector& bindings, std::unordered_map& bindingMap, nvinfer1::ICudaEngine* engine) +{ + for (auto& a : bindingMap) + { + auto bindIdx = engine->getBindingIndex(a.first.c_str()); + assert(bindIdx >= 0 && bindIdx < engine->getNbBindings()); + bindings[bindIdx] = a.second; + } +} + +int main(int argc, char** argv) +{ + auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); + + gLogger.reportTestStart(sampleTest); + + samplesCommon::Args args; + bool argsOK = parseNMTArgs(args, argc, argv); + if (args.help) + { + return EXIT_SUCCESS; + } + if (!argsOK) + { + return gLogger.reportFail(sampleTest); + } + if (gVerbose) + { + setReportableSeverity(Severity::kVERBOSE); + } + + // Set up output vocabulary + { + std::string vocabularyFilePath = gOutputVocabularyFileName; + std::ifstream vocabStream(locateNMTFile(vocabularyFilePath)); + if (!vocabStream.good()) + { + gLogError << "Cannot open file " << vocabularyFilePath << std::endl; + return gLogger.reportFail(sampleTest); + } + vocabStream >> *gOutputVocabulary; + } + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + auto outputSequenceProperties = getOutputSequenceProperties(); + auto dataReader = getDataReader(); + auto inputEmbedder = getInputEmbedder(); + auto outputEmbedder = getOutputEmbedder(); + auto encoder = getEncoder(); + auto decoder = getDecoder(); + auto alignment = getAlignment(); + auto context = getContext(); + auto attention = getAttention(); + auto projection = getProjection(); + auto likelihood = getLikelihood(); + auto searchPolicy + = getSearchPolicy(outputSequenceProperties->getEndSequenceId(), likelihood->getLikelihoodCombinationOperator()); + auto dataWriter = getDataWriter(); + + if (gPrintComponentInfo) + { + gLogInfo << "Component Info:" << std::endl; + gLogInfo << "- Data Reader: " << dataReader->getInfo() << std::endl; + gLogInfo << "- Input Embedder: " << inputEmbedder->getInfo() << std::endl; + gLogInfo << "- Output Embedder: " << outputEmbedder->getInfo() << std::endl; + gLogInfo << "- Encoder: " << encoder->getInfo() << std::endl; + gLogInfo << "- Decoder: " << decoder->getInfo() << std::endl; + gLogInfo << "- Alignment: " << alignment->getInfo() << std::endl; + gLogInfo << "- Context: " << context->getInfo() << std::endl; + gLogInfo << "- Attention: " << attention->getInfo() << std::endl; + gLogInfo << "- Projection: " << projection->getInfo() << std::endl; + gLogInfo << "- Likelihood: " << likelihood->getInfo() << std::endl; + gLogInfo << "- Search Policy: " << searchPolicy->getInfo() << std::endl; + gLogInfo << "- Data Writer: " << dataWriter->getInfo() << std::endl; + gLogInfo << "End of Component Info" << std::endl; + } + + std::vector stateSizes = decoder->getStateSizes(); + + // A number of consistency checks between components + assert(alignment->getSourceStatesSize() == encoder->getMemoryStatesSize()); + if (gInitializeDecoderFromEncoderHiddenStates) + { + std::vector encoderStateSizes = encoder->getStateSizes(); + assert(stateSizes.size() == encoderStateSizes.size()); + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + assert(nmtSample::getVolume(stateSizes[i]) == nmtSample::getVolume(encoderStateSizes[i])); + } + assert(projection->getOutputSize() == outputEmbedder->getInputDimensionSize()); + + auto inputOriginalHostBuffer + = std::make_shared>(gMaxBatchSize * gMaxInputSequenceLength); + auto inputHostBuffer = std::make_shared>(gMaxBatchSize * gMaxInputSequenceLength); + auto inputOriginalSequenceLengthsHostBuffer = 
std::make_shared>(gMaxBatchSize); + auto inputSequenceLengthsHostBuffer = std::make_shared>(gMaxBatchSize); + auto maxOutputSequenceLengthsHostBuffer = std::make_shared>(gMaxBatchSize); + auto outputSequenceLengthsHostBuffer = std::make_shared>(gMaxBatchSize); + auto outputCombinedLikelihoodHostBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto outputVocabularyIndicesHostBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto outputRayOptionIndicesHostBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto sourceRayIndicesHostBuffer = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto sourceLikelihoodsHostBuffer = std::make_shared>(gMaxBatchSize * gBeamWidth); + + // Allocated buffers on GPU to be used as inputs and outputs for TenorRT + auto inputEncoderDeviceBuffer + = std::make_shared>(gMaxBatchSize * gMaxInputSequenceLength); + auto inputSequenceLengthsDeviceBuffer = std::make_shared>(gMaxBatchSize); + auto inputSequenceLengthsReplicatedDeviceBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto memoryStatesDeviceBuffer = std::make_shared>( + gMaxBatchSize * gMaxInputSequenceLength * encoder->getMemoryStatesSize()); + auto attentionKeysDeviceBuffer + = std::make_shared>((alignment->getAttentionKeySize() > 0) + ? gMaxBatchSize * gMaxInputSequenceLength * alignment->getAttentionKeySize() + : 0); + std::vector::ptr> encoderStatesLastTimestepDeviceBuffers; + for (auto stateSize : stateSizes) + encoderStatesLastTimestepDeviceBuffers.push_back( + std::make_shared>(gMaxBatchSize * nmtSample::getVolume(stateSize))); + std::vector::ptr> inputDecoderStatesDeviceBuffers; + for (auto stateSize : stateSizes) + inputDecoderStatesDeviceBuffers.push_back(std::make_shared>( + gMaxBatchSize * gBeamWidth * nmtSample::getVolume(stateSize))); + std::vector::ptr> outputDecoderStatesDeviceBuffers; + for (auto stateSize : stateSizes) + outputDecoderStatesDeviceBuffers.push_back(std::make_shared>( + gMaxBatchSize * gBeamWidth * nmtSample::getVolume(stateSize))); + auto inputAttentionDeviceBuffer = std::make_shared>( + gFeedAttentionToInput ? gMaxBatchSize * gBeamWidth * attention->getAttentionSize() : 0); + auto outputAttentionDeviceBuffer = std::make_shared>( + gFeedAttentionToInput ? 
gMaxBatchSize * gBeamWidth * attention->getAttentionSize() : 0); + auto outputCombinedLikelihoodDeviceBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto outputRayOptionIndicesDeviceBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto sourceRayIndicesDeviceBuffer = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto inputDecoderDeviceBuffer = std::make_shared>(gMaxBatchSize * gBeamWidth); + auto inputLikelihoodsDeviceBuffer = std::make_shared>(gMaxBatchSize * gBeamWidth); + + std::vector::ptr> zeroInputEncoderStatesDeviceBuffers; + for (auto stateSize : stateSizes) + { + auto buf = std::make_shared>(gMaxBatchSize * nmtSample::getVolume(stateSize)); + CUDA_CHECK(cudaMemsetAsync(*buf, 0, gMaxBatchSize * nmtSample::getVolume(stateSize) * sizeof(float), stream)); + zeroInputEncoderStatesDeviceBuffers.push_back(buf); + } + + std::vector::ptr> zeroInputDecoderStatesDeviceBuffers; + for (auto stateSize : stateSizes) + { + auto buf = std::make_shared>( + gMaxBatchSize * gBeamWidth * nmtSample::getVolume(stateSize)); + CUDA_CHECK(cudaMemsetAsync( + *buf, 0, gMaxBatchSize * gBeamWidth * nmtSample::getVolume(stateSize) * sizeof(float), stream)); + zeroInputDecoderStatesDeviceBuffers.push_back(buf); + } + + auto zeroInputAttentionDeviceBuffer = std::make_shared>( + gFeedAttentionToInput ? gMaxBatchSize * gBeamWidth * attention->getAttentionSize() : 0); + if (gFeedAttentionToInput) + { + CUDA_CHECK(cudaMemsetAsync(*zeroInputAttentionDeviceBuffer, 0, + gMaxBatchSize * gBeamWidth * attention->getAttentionSize() * sizeof(float), stream)); + } + auto startSeqInputDecoderDeviceBuffer = std::make_shared>(gMaxBatchSize * gBeamWidth); + { + auto startSeqInputDecoderHostBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + std::fill_n((int*) *startSeqInputDecoderHostBuffer, gMaxBatchSize * gBeamWidth, + outputSequenceProperties->getStartSequenceId()); + CUDA_CHECK(cudaMemcpyAsync(*startSeqInputDecoderDeviceBuffer, *startSeqInputDecoderHostBuffer, + gMaxBatchSize * gBeamWidth * sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + auto zeroInitializeDecoderIndicesDeviceBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + CUDA_CHECK(cudaMemsetAsync( + *zeroInitializeDecoderIndicesDeviceBuffer, 0, gMaxBatchSize * gBeamWidth * sizeof(int), stream)); + auto initialInputLikelihoodsDeviceBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + { + auto likelihoodCombinationOperator = likelihood->getLikelihoodCombinationOperator(); + auto initialInputLikelihoodsHostBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + for (int sampleId = 0; sampleId < gMaxBatchSize; ++sampleId) + { + (*initialInputLikelihoodsHostBuffer)[sampleId * gBeamWidth] = likelihoodCombinationOperator->init(); + for (int rayId = 1; rayId < gBeamWidth; ++rayId) + (*initialInputLikelihoodsHostBuffer)[sampleId * gBeamWidth + rayId] + = likelihoodCombinationOperator->smallerThanMinimalLikelihood(); + } + CUDA_CHECK(cudaMemcpyAsync(*initialInputLikelihoodsDeviceBuffer, *initialInputLikelihoodsHostBuffer, + gMaxBatchSize * gBeamWidth * sizeof(float), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + auto zeroReplicateLikelihoodsIndicesDeviceBuffer + = std::make_shared>(gMaxBatchSize * gBeamWidth); + CUDA_CHECK(cudaMemsetAsync( + *zeroReplicateLikelihoodsIndicesDeviceBuffer, 0, gMaxBatchSize * gBeamWidth * sizeof(int), stream)); + + // Create TensorRT engines + nvinfer1::ICudaEngine* encoderEngine = 
getEncoderEngine(inputEmbedder, encoder, alignment); + nvinfer1::ICudaEngine* generatorEngine + = getGeneratorEngine(outputEmbedder, decoder, alignment, context, attention, projection, likelihood); + nvinfer1::ICudaEngine* generatorShuffleEngine + = getGeneratorShuffleEngine(decoder->getStateSizes(), attention->getAttentionSize()); + + // Setup TensorRT bindings + std::vector encoderBindings(encoderEngine->getNbBindings()); + std::unordered_map encBindingMap; + encBindingMap["input_encoder_data"] = *inputEncoderDeviceBuffer; + encBindingMap["actual_input_sequence_lengths"] = *inputSequenceLengthsDeviceBuffer; + encBindingMap["actual_input_sequence_lengths_with_index_dim"] = *inputSequenceLengthsDeviceBuffer; + encBindingMap["actual_input_sequence_lengths_replicated"] = *inputSequenceLengthsReplicatedDeviceBuffer; + encBindingMap["initialize_decoder_indices"] = *zeroInitializeDecoderIndicesDeviceBuffer; + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_encoder_states_" << i; + encBindingMap[ss.str()] = *zeroInputEncoderStatesDeviceBuffers[i]; + } + encBindingMap["memory_states"] = *memoryStatesDeviceBuffer; + if (alignment->getAttentionKeySize() > 0) + { + encBindingMap["attention_keys"] = *attentionKeysDeviceBuffer; + } + if (gInitializeDecoderFromEncoderHiddenStates) + { + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_decoder_states_" << i; + encBindingMap[ss.str()] = *inputDecoderStatesDeviceBuffers[i]; + } + } + processBindings(encoderBindings, encBindingMap, encoderEngine); + + std::vector generatorBindings(generatorEngine->getNbBindings()); + std::unordered_map genBindingMap; + genBindingMap["input_decoder_data"] = *inputDecoderDeviceBuffer; + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_decoder_states_" << i; + genBindingMap[ss.str()] = *inputDecoderStatesDeviceBuffers[i]; + } + genBindingMap["actual_input_sequence_lengths_replicated"] = *inputSequenceLengthsReplicatedDeviceBuffer; + genBindingMap["memory_states"] = *memoryStatesDeviceBuffer; + if (alignment->getAttentionKeySize() > 0) + { + genBindingMap["attention_keys"] = *attentionKeysDeviceBuffer; + } + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "output_decoder_states_" << i; + genBindingMap[ss.str()] = *outputDecoderStatesDeviceBuffers[i]; + } + genBindingMap["output_combined_likelihoods"] = *outputCombinedLikelihoodDeviceBuffer; + genBindingMap["output_vocabulary_indices"] = *inputDecoderDeviceBuffer; + genBindingMap["output_ray_option_indices"] = *outputRayOptionIndicesDeviceBuffer; + if (gFeedAttentionToInput) + { + genBindingMap["input_attention"] = *inputAttentionDeviceBuffer; + genBindingMap["output_attention"] = *outputAttentionDeviceBuffer; + } + genBindingMap["input_likelihoods"] = *inputLikelihoodsDeviceBuffer; + genBindingMap["replicate_likelihoods_indices"] = *zeroReplicateLikelihoodsIndicesDeviceBuffer; + processBindings(generatorBindings, genBindingMap, generatorEngine); + + std::vector generatorBindingsFirstStep = generatorBindings; + std::unordered_map genBindingFirstStepMap; + genBindingFirstStepMap["input_decoder_data"] = *startSeqInputDecoderDeviceBuffer; + if (!gInitializeDecoderFromEncoderHiddenStates) + { + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_decoder_states_" << i; + genBindingFirstStepMap[ss.str()] = 
*zeroInputDecoderStatesDeviceBuffers[i]; + } + } + if (gFeedAttentionToInput) + { + genBindingFirstStepMap["input_attention"] = *zeroInputAttentionDeviceBuffer; + } + genBindingFirstStepMap["input_likelihoods"] = *initialInputLikelihoodsDeviceBuffer; + processBindings(generatorBindingsFirstStep, genBindingFirstStepMap, generatorEngine); + + std::vector generatorShuffleBindings(generatorShuffleEngine->getNbBindings()); + std::unordered_map genShuffleBindingMap; + genShuffleBindingMap["source_ray_indices"] = *sourceRayIndicesDeviceBuffer; + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "previous_output_decoder_states_" << i; + genShuffleBindingMap[ss.str()] = *outputDecoderStatesDeviceBuffers[i]; + } + for (int i = 0; i < static_cast(stateSizes.size()); ++i) + { + std::stringstream ss; + ss << "input_decoder_states_" << i; + genShuffleBindingMap[ss.str()] = *inputDecoderStatesDeviceBuffers[i]; + } + if (gFeedAttentionToInput) + { + genShuffleBindingMap["previous_output_attention"] = *outputAttentionDeviceBuffer; + genShuffleBindingMap["input_attention"] = *inputAttentionDeviceBuffer; + } + processBindings(generatorShuffleBindings, genShuffleBindingMap, generatorShuffleEngine); + + // Create Tensor RT contexts + nvinfer1::IExecutionContext* encoderContext = encoderEngine->createExecutionContext(); + nvinfer1::IExecutionContext* generatorContext = generatorEngine->createExecutionContext(); + nvinfer1::IExecutionContext* generatorShuffleContext = generatorShuffleEngine->createExecutionContext(); + + std::vector profilers; + if (gEnableProfiling) + { + profilers.push_back(SimpleProfiler("Host")); + profilers.push_back(SimpleProfiler("Encoder")); + profilers.push_back(SimpleProfiler("Decoder")); + profilers.push_back(SimpleProfiler("Beam shuffle")); + encoderContext->setProfiler(&profilers[1]); + generatorContext->setProfiler(&profilers[2]); + generatorShuffleContext->setProfiler(&profilers[3]); + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + dataWriter->initialize(); + + std::vector outputHostBuffer; + auto startDataRead = std::chrono::high_resolution_clock::now(); + int inputSamplesRead = dataReader->read( + gMaxBatchSize, gMaxInputSequenceLength, *inputOriginalHostBuffer, *inputOriginalSequenceLengthsHostBuffer); + if (gEnableProfiling) + profilers[0].reportLayerTime("Data Read", + std::chrono::duration(std::chrono::high_resolution_clock::now() - startDataRead) + .count()); + // Outer loop over batches of samples + auto startLatency = std::chrono::high_resolution_clock::now(); + int batchCount = 0; + while (inputSamplesRead > 0) + { + ++batchCount; + + // Sort input sequences in the batch in the order of decreasing length + // The idea is that shorter input sequences gets translated faster so we can reduce batch size quickly for the + // generator + auto startBatchSort = std::chrono::high_resolution_clock::now(); + std::vector samplePositions(inputSamplesRead); + { + std::vector> sequenceSampleIdAndLength(inputSamplesRead); + for (int sampleId = 0; sampleId < inputSamplesRead; ++sampleId) + sequenceSampleIdAndLength[sampleId] + = std::make_pair(sampleId, ((const int*) *inputOriginalSequenceLengthsHostBuffer)[sampleId]); + std::sort(sequenceSampleIdAndLength.begin(), sequenceSampleIdAndLength.end(), + [](const std::pair& a, const std::pair& b) -> bool { return a.second > b.second; }); + for (int position = 0; position < inputSamplesRead; ++position) + { + int sampleId = sequenceSampleIdAndLength[position].first; + ((int*) 
*inputSequenceLengthsHostBuffer)[position] + = ((const int*) *inputOriginalSequenceLengthsHostBuffer)[sampleId]; + std::copy_n(((const int*) *inputOriginalHostBuffer) + sampleId * gMaxInputSequenceLength, + gMaxInputSequenceLength, ((int*) *inputHostBuffer) + position * gMaxInputSequenceLength); + samplePositions[sampleId] = position; + } + } + if (gEnableProfiling) + profilers[0].reportLayerTime("Intra-batch Sort", + std::chrono::duration(std::chrono::high_resolution_clock::now() - startBatchSort) + .count()); + + CUDA_CHECK(cudaMemcpyAsync(*inputEncoderDeviceBuffer, *inputHostBuffer, + inputSamplesRead * gMaxInputSequenceLength * sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(*inputSequenceLengthsDeviceBuffer, *inputSequenceLengthsHostBuffer, + inputSamplesRead * sizeof(int), cudaMemcpyHostToDevice, stream)); + + // Overlap host and device: Read data for the next batch while encode for this one is running + std::future nextInputSamplesReadFuture = std::async(std::launch::async, [&]() { + return dataReader->read(gMaxBatchSize, gMaxInputSequenceLength, *inputOriginalHostBuffer, + *inputOriginalSequenceLengthsHostBuffer); + }); + + encoderContext->enqueue(inputSamplesRead, &encoderBindings[0], stream, nullptr); + + // Limit output sequences length to input_sequence_length * 2 + std::transform((const int*) *inputSequenceLengthsHostBuffer, + (const int*) *inputSequenceLengthsHostBuffer + inputSamplesRead, (int*) *maxOutputSequenceLengthsHostBuffer, + [](int i) { + int r = i * 2; + if (gMaxOutputSequenceLength >= 0) + r = std::min(r, gMaxOutputSequenceLength); + return r; + }); + searchPolicy->initialize(inputSamplesRead, *maxOutputSequenceLengthsHostBuffer); + int batchMaxOutputSequenceLength = *std::max_element( + (int*) *maxOutputSequenceLengthsHostBuffer, (int*) *maxOutputSequenceLengthsHostBuffer + inputSamplesRead); + outputHostBuffer.resize(gMaxBatchSize * batchMaxOutputSequenceLength); + + // Inner loop over generator timesteps + int validSampleCount = searchPolicy->getTailWithNoWorkRemaining(); + for (int outputTimestep = 0; (outputTimestep < batchMaxOutputSequenceLength) && (validSampleCount > 0); + ++outputTimestep) + { + // Generator initialization and beam shuffling + if (outputTimestep == 0) + { + generatorContext->enqueue(validSampleCount, &generatorBindingsFirstStep[0], stream, nullptr); + } + else + { + generatorShuffleContext->enqueue(validSampleCount, &generatorShuffleBindings[0], stream, nullptr); + generatorContext->enqueue(validSampleCount, &generatorBindings[0], stream, nullptr); + } + + CUDA_CHECK(cudaMemcpyAsync(*outputCombinedLikelihoodHostBuffer, *outputCombinedLikelihoodDeviceBuffer, + validSampleCount * gBeamWidth * sizeof(float), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(*outputVocabularyIndicesHostBuffer, *inputDecoderDeviceBuffer, + validSampleCount * gBeamWidth * sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(*outputRayOptionIndicesHostBuffer, *outputRayOptionIndicesDeviceBuffer, + validSampleCount * gBeamWidth * sizeof(int), cudaMemcpyDeviceToHost, stream)); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + auto startBeamSearch = std::chrono::high_resolution_clock::now(); + searchPolicy->processTimestep(validSampleCount, *outputCombinedLikelihoodHostBuffer, + *outputVocabularyIndicesHostBuffer, *outputRayOptionIndicesHostBuffer, *sourceRayIndicesHostBuffer, + *sourceLikelihoodsHostBuffer); + if (gEnableProfiling) + profilers[0].reportLayerTime("Beam Search", + 
std::chrono::duration( + std::chrono::high_resolution_clock::now() - startBeamSearch) + .count()); + + CUDA_CHECK(cudaMemcpyAsync(*sourceRayIndicesDeviceBuffer, *sourceRayIndicesHostBuffer, + validSampleCount * gBeamWidth * sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(*inputLikelihoodsDeviceBuffer, *sourceLikelihoodsHostBuffer, + validSampleCount * gBeamWidth * sizeof(float), cudaMemcpyHostToDevice, stream)); + + validSampleCount = searchPolicy->getTailWithNoWorkRemaining(); + } // for(int outputTimestep + + auto startBacktrack = std::chrono::high_resolution_clock::now(); + searchPolicy->readGeneratedResult( + inputSamplesRead, batchMaxOutputSequenceLength, &outputHostBuffer[0], *outputSequenceLengthsHostBuffer); + if (gEnableProfiling) + profilers[0].reportLayerTime("Read Result", + std::chrono::duration(std::chrono::high_resolution_clock::now() - startBacktrack) + .count()); + + auto startDataWrite = std::chrono::high_resolution_clock::now(); + for (int sampleId = 0; sampleId < inputSamplesRead; ++sampleId) + { + int position = samplePositions[sampleId]; + dataWriter->write(&outputHostBuffer[0] + position * batchMaxOutputSequenceLength, + ((const int*) *outputSequenceLengthsHostBuffer)[position], + ((const int*) *inputSequenceLengthsHostBuffer)[position]); + } + if (gEnableProfiling) + profilers[0].reportLayerTime("Data Write", + std::chrono::duration(std::chrono::high_resolution_clock::now() - startDataWrite) + .count()); + + auto startDataRead = std::chrono::high_resolution_clock::now(); + inputSamplesRead = nextInputSamplesReadFuture.get(); + if (gEnableProfiling) + profilers[0].reportLayerTime("Data Read", + std::chrono::duration(std::chrono::high_resolution_clock::now() - startDataRead) + .count()); + } + float totalLatency + = std::chrono::duration(std::chrono::high_resolution_clock::now() - startLatency).count(); + + dataWriter->finalize(); + float score + = gDataWriterStr == "bleu" ? static_cast(dataWriter.get())->getScore() : -1.0f; + + if (gDataWriterStr == "benchmark") + { + gLogInfo << "Average latency (without data read) = " << totalLatency / static_cast(batchCount) << " ms" + << std::endl; + } + + if (gEnableProfiling) + { + if (gAggregateProfiling) + { + SimpleProfiler aggregateProfiler("Aggregate", profilers); + gLogInfo << aggregateProfiler << std::endl; + } + else + { + for (const auto& profiler : profilers) + gLogInfo << profiler << std::endl; + } + } + + encoderContext->destroy(); + generatorContext->destroy(); + generatorShuffleContext->destroy(); + + encoderEngine->destroy(); + generatorEngine->destroy(); + generatorShuffleEngine->destroy(); + + cudaStreamDestroy(stream); + + bool pass = gDataWriterStr != "bleu" || score >= 25.0f; + + return gLogger.reportTest(sampleTest, pass); +} diff --git a/samples/opensource/sampleNMT/trtUtil.cpp b/samples/opensource/sampleNMT/trtUtil.cpp new file mode 100644 index 00000000..b2c51fa7 --- /dev/null +++ b/samples/opensource/sampleNMT/trtUtil.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "trtUtil.h" + +#include <cassert> +#include <functional> +#include <numeric> + +namespace nmtSample +{ +int inferTypeToBytes(nvinfer1::DataType t) +{ + switch (t) + { + case nvinfer1::DataType::kFLOAT: return sizeof(float); break; + case nvinfer1::DataType::kHALF: return sizeof(int16_t); break; + default: assert(0); break; + } +}; + +int getVolume(nvinfer1::Dims dims) +{ + return std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int>()); +} + +std::vector<float> resizeWeights(int rows, int cols, int rowsNew, int colsNew, const float* memory) +{ + std::vector<float> result(rowsNew * colsNew); + for (int row = 0; row < rows; row++) + { + for (int col = 0; col < cols; col++) + { + result[row * colsNew + col] = memory[row * cols + col]; + } + } + return result; +} + +} // namespace nmtSample
diff --git a/samples/opensource/sampleNMT/trtUtil.h b/samples/opensource/sampleNMT/trtUtil.h new file mode 100644 index 00000000..b96da3cc --- /dev/null +++ b/samples/opensource/sampleNMT/trtUtil.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SAMPLE_NMT_TRT_UTIL_ +#define SAMPLE_NMT_TRT_UTIL_ + +#include "NvInfer.h" +#include <vector> + +namespace nmtSample +{ +int inferTypeToBytes(nvinfer1::DataType t); + +int getVolume(nvinfer1::Dims dims); + +// Resize weights matrix to larger size +std::vector<float> resizeWeights(int rows, int cols, int rowsNew, int colsNew, const float* memory); + +} // namespace nmtSample + +#endif // SAMPLE_NMT_TRT_UTIL_
diff --git a/samples/opensource/sampleOnnxMNIST/sampleOnnxMNIST.cpp b/samples/opensource/sampleOnnxMNIST/sampleOnnxMNIST.cpp index 60b39764..8b386612 100644 --- a/samples/opensource/sampleOnnxMNIST/sampleOnnxMNIST.cpp +++ b/samples/opensource/sampleOnnxMNIST/sampleOnnxMNIST.cpp @@ -78,7 +78,8 @@ class SampleOnnxMNIST //! \brief Parses an ONNX model for MNIST and creates a TensorRT network //! bool constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder, - SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvonnxparser::IParser>& parser); + SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config, + SampleUniquePtr<nvonnxparser::IParser>& parser); //! //!
\brief Reads the input and stores the result in a managed buffer @@ -113,19 +114,26 @@ bool SampleOnnxMNIST::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvonnxparser::createParser(*network, gLogger.getTRTLogger())); if (!parser) { return false; } - auto constructed = constructNetwork(builder, network, parser); + auto constructed = constructNetwork(builder, network, config, parser); if (!constructed) { return false; } - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; @@ -151,7 +159,8 @@ bool SampleOnnxMNIST::build() //! \param builder Pointer to the engine builder //! bool SampleOnnxMNIST::constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser) + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser) { auto parsed = parser->parseFromFile( locateFile(mParams.onnxFileName, mParams.dataDirs).c_str(), static_cast(gLogger.getReportableSeverity())); @@ -161,15 +170,18 @@ bool SampleOnnxMNIST::constructNetwork(SampleUniquePtr& buil } builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); - builder->setFp16Mode(mParams.fp16); + config->setMaxWorkspaceSize(16_MiB); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } if (mParams.int8) { - builder->setInt8Mode(true); + config->setFlag(BuilderFlag::kINT8); samplesCommon::setAllTensorScales(network.get(), 127.0f, 127.0f); } - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); return true; } diff --git a/samples/opensource/samplePlugin/fcPlugin.h b/samples/opensource/samplePlugin/fcPlugin.h index 08875301..4a4818d2 100644 --- a/samples/opensource/samplePlugin/fcPlugin.h +++ b/samples/opensource/samplePlugin/fcPlugin.h @@ -105,7 +105,13 @@ class FCPlugin : public nvinfer1::IPluginExt bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override { - return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) + int device; + CHECK(cudaGetDevice(&device)); + cudaDeviceProp props{}; + cudaGetDeviceProperties(&props, device); + int smVersion = props.major << 8 | props.minor; + // Half precision is supported after SM60 + return (type == nvinfer1::DataType::kFLOAT || (type == nvinfer1::DataType::kHALF && smVersion >= 0x600)) && format == nvinfer1::PluginFormat::kNCHW; } @@ -327,21 +333,40 @@ class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IP virtual IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override { - // there's no way to pass parameters through from the model definition, so we have to define it here explicitly - static const int NB_OUTPUT_CHANNELS = 10; - assert(isPlugin(layerName) && nbWeights == 2); - assert(mPlugin.get() == nullptr); - mPlugin = std::unique_ptr(new FCPlugin(weights, nbWeights, NB_OUTPUT_CHANNELS)); - return mPlugin.get(); + try + { + // there's no way to pass parameters through from the model definition, so we have to define it here + // explicitly + static const int NB_OUTPUT_CHANNELS = 10; + assert(isPlugin(layerName) && nbWeights == 2); + assert(mPlugin.get() == nullptr); + mPlugin = std::unique_ptr(new FCPlugin(weights, 
nbWeights, NB_OUTPUT_CHANNELS)); + return mPlugin.get(); + } + catch (std::exception& e) + { + gLogError << e.what() << std::endl; + } + + return nullptr; } // deserialization plugin implementation nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override { - assert(isPlugin(layerName)); - // This plugin object is destroyed when engine is destroyed by calling - // IPluginExt::destroy() - return new FCPlugin(serialData, serialLength); + try + { + assert(isPlugin(layerName)); + // This plugin object is destroyed when engine is destroyed by calling + // IPluginExt::destroy() + return new FCPlugin(serialData, serialLength); + } + catch (std::exception& e) + { + gLogError << e.what() << std::endl; + } + + return nullptr; } // User application destroys plugin when it is safe to do so. diff --git a/samples/opensource/samplePlugin/samplePlugin.cpp b/samples/opensource/samplePlugin/samplePlugin.cpp index d72618ca..253fc8f0 100644 --- a/samples/opensource/samplePlugin/samplePlugin.cpp +++ b/samples/opensource/samplePlugin/samplePlugin.cpp @@ -130,6 +130,12 @@ bool SamplePlugin::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); if (!parser) { @@ -143,17 +149,25 @@ bool SamplePlugin::build() constructNetwork(builder, parser, network); builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(1_MB); - builder->setFp16Mode(mParams.fp16); - builder->setInt8Mode(mParams.int8); - samplesCommon::setDummyInt8Scales(builder.get(), network.get()); - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + config->setMaxWorkspaceSize(1_MiB); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + if (mParams.int8) + { + config->setFlag(BuilderFlag::kINT8); + } + samplesCommon::setDummyInt8Scales(config.get(), network.get()); + + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); // For illustrative purposes, we will use the builder to create a CUDA engine, // serialize it to mModelStream object (which can be written to a file), then // deserialize mModelStream with a IRuntime object to recreate the original engine. // Note for this sample we could have simply used the original engine produced by builder->buildEngineWithConfig() - auto modelStream = SampleUniquePtr(builder->buildCudaEngine(*network)->serialize()); + auto modelStream + = SampleUniquePtr(builder->buildEngineWithConfig(*network, *config)->serialize()); assert(modelStream != nullptr); auto runtime = SampleUniquePtr(nvinfer1::createInferRuntime(gLogger.getTRTLogger())); diff --git a/samples/opensource/sampleReformatFreeIO/CMakeLists.txt b/samples/opensource/sampleReformatFreeIO/CMakeLists.txt new file mode 100755 index 00000000..c9d1ee83 --- /dev/null +++ b/samples/opensource/sampleReformatFreeIO/CMakeLists.txt @@ -0,0 +1,22 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +SET(SAMPLE_SOURCES + sampleReformatFreeIO.cpp +) + +set(SAMPLE_PARSERS "caffe") + +include(../../CMakeSamplesTemplate.txt)
diff --git a/samples/opensource/sampleReformatFreeIO/README.md b/samples/opensource/sampleReformatFreeIO/README.md new file mode 100755 index 00000000..46674974 --- /dev/null +++ b/samples/opensource/sampleReformatFreeIO/README.md @@ -0,0 +1,94 @@ +# Specifying I/O Formats Using The Reformat Free I/O APIs + + +**Table Of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, sampleReformatFreeIO, uses a Caffe model that was trained on the [MNIST dataset](https://github.com/NVIDIA/DIGITS/blob/master/docs/GettingStarted.md) and performs engine building and inference using TensorRT. The correctness of the outputs is then compared to the golden reference. Specifically, it shows how to use the reformat free I/O APIs to explicitly specify the I/O formats `TensorFormat::kLINEAR`, `TensorFormat::kCHW2` and `TensorFormat::kHWC8` for Float16 and INT8 precision. + +## How does this sample work? + +`ITensor::setAllowedFormats` is invoked to specify which formats the I/O tensors are expected to support, so that unnecessary reformatting to and from FP32 formats is not inserted for the I/O tensors. `BuilderFlag::kSTRICT_TYPES` is also assigned to the builder configuration to let the builder choose a reformat free path rather than the fastest path. + +**Note:** If the reformat free path is not implemented, then the fastest path with reformatting will be selected with the following warning message: +`Warning: no implementation obeys reformatting-free rules, ....` + + ``` + bool SampleReformatFreeIO::build(int dataWidth) + { + ... + + network->getInput(0)->setAllowedFormats(static_cast<TensorFormats>(1 << static_cast<int>(mTensorFormat))); + network->getOutput(0)->setAllowedFormats(static_cast<TensorFormats>(1 << static_cast<int>(mTensorFormat))); + ... + config->setFlag(BuilderFlag::kSTRICT_TYPES); + ... + } + ``` + +## Running the sample + +1. Compile this sample by running `make` in the `<TensorRT root directory>/samples/sampleReformatFreeIO` directory. The binary named `sample_reformat_free_io` will be created in the `<TensorRT root directory>/bin` directory. + ``` + cd <TensorRT root directory>/samples/sampleReformatFreeIO + make + ``` + + Where `<TensorRT root directory>` is where you installed TensorRT. + +2. Run inference on the digit looping from 0 to 9: + `./sample_reformat_free_io` + +3. Verify that all 10 digits match correctly. If the sample runs successfully, you should see output similar to the following: + ``` + &&&& RUNNING TensorRT.sample_reformat_free_io # ./sample_reformat_free_io + [I] The test chooses MNIST as the network and recognizes a randomly generated digit + [I] Firstly it runs the FP32 as the golden data, then INT8/FP16 with different formats will be tested + [I] + [I] Building and running a FP32 GPU inference to get golden input/output + [I] [TRT] Detected 1 input and 1 output network tensors. + [I] Input: + ... (omitted message) + &&&& PASSED TensorRT.sample_reformat_free_io + ``` + This output shows that the sample ran successfully; `PASSED`.
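As a side note to the format selection described in [How does this sample work?](#how-does-this-sample-work), the following is a minimal, hypothetical sketch (not part of the sample) of how an application can restrict an I/O tensor to reformat-free formats before building the engine; the `network` and `config` pointers and the chosen formats are assumptions for illustration only.

```cpp
// Sketch only: assumes an existing nvinfer1::INetworkDefinition* network and
// nvinfer1::IBuilderConfig* config created from the same IBuilder.
using namespace nvinfer1;

// TensorFormats is a bitmask: each allowed TensorFormat contributes one bit.
TensorFormats allowed = (1U << static_cast<int>(TensorFormat::kLINEAR))
    | (1U << static_cast<int>(TensorFormat::kCHW4));

network->getInput(0)->setType(DataType::kINT8);   // I/O tensor precision (INT8 also needs the kINT8 flag and tensor scales)
network->getInput(0)->setAllowedFormats(allowed);  // restrict the input layout to the listed formats
config->setFlag(BuilderFlag::kSTRICT_TYPES);       // ask the builder for a reformat-free path
```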
+ +### Sample `--help` options + +To see the full list of available options and their descriptions, use the `./sample_reformat_free_io --help` command. + +## Additional resources + +The following resources provide a deeper understanding about this sample: + +**Models** +- [MNIST](https://keras.io/datasets/#mnist-database-of-handwritten-digits) + +**Documentation** +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + + +## Changelog + +June 2019 +This is the first release of the `README.md` file and sample. + + +## Known issues + +There are no known issues in this sample. diff --git a/samples/opensource/sampleReformatFreeIO/sampleReformatFreeIO.cpp b/samples/opensource/sampleReformatFreeIO/sampleReformatFreeIO.cpp new file mode 100644 index 00000000..1348af99 --- /dev/null +++ b/samples/opensource/sampleReformatFreeIO/sampleReformatFreeIO.cpp @@ -0,0 +1,769 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! \file SampleReformatFreeIO.cpp +//! \brief This file contains the implementation of the reformat free I/O sample. +//! +//! It builds a TensorRT engine by constructing a conv layer. It uses the engine to run +//! a conv layer with random input and weights. +//! The goal of this sample is to show how to specify allowed I/O formats. +//! It can be run with the following command line: +//! 
Command: ./sample_reformat_free_io + +#include "argsParser.h" +#include "buffers.h" +#include "common.h" +#include "half.h" +#include "logger.h" + +#include "NvCaffeParser.h" +#include "NvInfer.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +const std::string gSampleName = "TensorRT.sample_reformat_free_io"; + +int divUp(int a, int b) +{ + return (a + b - 1) / b; +} + +template +std::shared_ptr mallocCudaMem(size_t nbElems) +{ + T* ptr = nullptr; + CHECK(cudaMalloc((void**) &ptr, sizeof(T) * nbElems)); + return std::shared_ptr(ptr, [](T* p) { CHECK(cudaFree(p)); }); +} + +class BufferDesc +{ +public: + BufferDesc() = default; + + BufferDesc(nvinfer1::Dims dims, int dataWidth, TensorFormat format) + { + this->dataWidth = dataWidth; + if (format == TensorFormat::kLINEAR) + { + this->dims[0] = dims.d[0]; + this->dims[1] = dims.d[1]; + this->dims[2] = dims.d[2]; + this->dims[3] = 1; + } + else if (format == TensorFormat::kCHW2) + { + this->dims[0] = divUp(dims.d[0], 2); + this->dims[1] = dims.d[1]; + this->dims[2] = dims.d[2]; + this->dims[3] = 2; + this->scalarPerVector = 2; + } + else if (format == TensorFormat::kCHW4) + { + this->dims[0] = divUp(dims.d[0], 4); + this->dims[1] = dims.d[1]; + this->dims[2] = dims.d[2]; + this->dims[3] = 4; + this->scalarPerVector = 4; + } + else if (format == TensorFormat::kCHW32) + { + this->dims[0] = divUp(dims.d[0], 32); + this->dims[1] = dims.d[1]; + this->dims[2] = dims.d[2]; + this->dims[3] = 32; + this->scalarPerVector = 32; + } + else if (format == TensorFormat::kHWC8) + { + this->dims[0] = dims.d[1]; + this->dims[1] = dims.d[2]; + this->dims[2] = divUp(dims.d[0], 8) * 8; + this->dims[3] = 1; + this->scalarPerVector = 8; + this->channelPivot = true; + } + } + + // [(C+x-1)/x][H][W][x] + // or + // [H][W][(C+x-1)/x*x][1] + int dims[4] = {1, 1, 1, 1}; + int dataWidth = 1; + int scalarPerVector = 1; + + bool channelPivot = false; + + int getElememtSize() + { + return dims[0] * dims[1] * dims[2] * dims[3]; + } + int getBufferSize() + { + return getElememtSize() * dataWidth; + } +}; + +class SampleBuffer +{ +public: + SampleBuffer() + { + dims.d[0] = 1; + dims.d[1] = 1; + dims.d[2] = 1; + } + + SampleBuffer(nvinfer1::Dims dims, int dataWidth, TensorFormat format) + : dims(dims) + , dataWidth(dataWidth) + , format(format) + , desc(dims, dataWidth, format) + { + if (nullptr == buffer) + { + buffer = new uint8_t[getBufferSize()](); + } + } + + ~SampleBuffer() + { + destroy(); + } + + SampleBuffer& operator=(SampleBuffer&& sampleBuffer) noexcept + { + destroy(); + + this->dims = sampleBuffer.dims; + this->dataWidth = sampleBuffer.dataWidth; + this->desc = sampleBuffer.desc; + this->format = sampleBuffer.format; + this->buffer = sampleBuffer.buffer; + sampleBuffer.buffer = nullptr; + + return *this; + } + + void destroy() + { + if (buffer != nullptr) + { + delete[] buffer; + buffer = nullptr; + } + } + + nvinfer1::Dims dims; + + int dataWidth{1}; + + TensorFormat format{TensorFormat::kLINEAR}; + + BufferDesc desc; + + uint8_t* buffer = nullptr; + + int getBufferSize() + { + return desc.getBufferSize(); + } +}; + +//! +//! \brief The SampleReformatFreeIO class implements the reformat free I/O sample +//! +//! \details It creates the network using a single conv layer +//! +class SampleReformatFreeIO +{ + template + using SampleUniquePtr = std::unique_ptr; + +public: + SampleReformatFreeIO(const samplesCommon::CaffeSampleParams& params) + : mParams(params) + { + } + + //! + //! 
\brief Builds the network engine + //! + bool build(int dataWidth); + + //! + //! \brief Runs the TensorRT inference engine for this sample + //! + bool infer(SampleBuffer& inputBuf, SampleBuffer& outputBuf); + + //! + //! \brief Used to clean up any state created in the sample class + //! + bool teardown(); + + //! + //! \brief Used to run CPU reference and get result + //! + bool reference(); + + //! + //! \brief Used to compare the CPU reference with the TRT result + //! + void compareResult(); + + //! + //! \brief Reads the digit map from the file + //! + bool readDigits(SampleBuffer& buffer, int groundTruthDigit); + + //! + //! \brief Verifies that the output is correct and prints it + //! + template + bool verifyOutput(SampleBuffer& outputBuf, int groundTruthDigit) const; + +private: + //! + //! \brief uses a Caffe parser to create the single layer Network and marks the + //! output layers + //! + void constructNetwork( + SampleUniquePtr& parser, SampleUniquePtr& network); + + std::shared_ptr mEngine{nullptr}; //!< The TensorRT engine used to run the network + +public: + samplesCommon::CaffeSampleParams mParams; + + nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network. + + nvinfer1::Dims mOutputDims; //!< The dimensions of the output to the network. + + TensorFormat mTensorFormat{TensorFormat::kLINEAR}; + + SampleUniquePtr mMeanBlob; + + int mDigit; +}; + +//! +//! \brief Creates the network, configures the builder and creates the network engine +//! +//! \details This function creates the single layer network by manual insertion and builds +//! the engine +//! +//! \return Returns true if the engine was created successfully and false otherwise +//! +bool SampleReformatFreeIO::build(int dataWidth) +{ + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); + if (!builder) + { + return false; + } + + auto network = SampleUniquePtr(builder->createNetwork()); + if (!network) + { + return false; + } + + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); + if (!parser) + { + return false; + } + + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + + constructNetwork(parser, network); + + network->getInput(0)->setAllowedFormats(static_cast(1 << static_cast(mTensorFormat))); + network->getOutput(0)->setAllowedFormats(static_cast(1 << static_cast(mTensorFormat))); + + builder->setMaxBatchSize(1); + + mEngine.reset(); + + config->setMaxWorkspaceSize(256_MiB); + if (dataWidth == 1) + { + config->setFlag(BuilderFlag::kINT8); + network->getInput(0)->setType(DataType::kINT8); + network->getOutput(0)->setType(DataType::kINT8); + } + if (dataWidth == 2) + { + config->setFlag(BuilderFlag::kFP16); + network->getInput(0)->setType(DataType::kHALF); + network->getOutput(0)->setType(DataType::kHALF); + } + + config->setFlag(BuilderFlag::kGPU_FALLBACK); + config->setFlag(BuilderFlag::kSTRICT_TYPES); + + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); + + if (!mEngine) + return false; + + assert(network->getNbInputs() == 1); + mInputDims = network->getInput(0)->getDimensions(); + assert(mInputDims.nbDims == 3); + + assert(network->getNbOutputs() == 1); + mOutputDims = network->getOutput(0)->getDimensions(); + assert(mOutputDims.nbDims == 3); + + return true; +} + +//! +//! \brief Uses a caffe parser to create the single layer Network and marks the +//! output layers +//! 
+void SampleReformatFreeIO::constructNetwork( + SampleUniquePtr& parser, SampleUniquePtr& network) +{ + const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse( + mParams.prototxtFileName.c_str(), mParams.weightsFileName.c_str(), *network, nvinfer1::DataType::kFLOAT); + + for (auto& s : mParams.outputTensorNames) + { + network->markOutput(*blobNameToTensor->find(s.c_str())); + } + + nvinfer1::Dims inputDims = network->getInput(0)->getDimensions(); + // add mean subtraction to the beginning of the network + mMeanBlob + = SampleUniquePtr(parser->parseBinaryProto(mParams.meanFileName.c_str())); + nvinfer1::Weights meanWeights{nvinfer1::DataType::kFLOAT, mMeanBlob->getData(), inputDims.d[1] * inputDims.d[2]}; + // For this sample, a large range based on the mean data is chosen and applied to the entire network. + // The preferred method is use scales computed based on a representative data set + // and apply each one individually based on the tensor. The range here is large enough for the + // network, but is chosen for example purposes only. + float maxMean + = samplesCommon::getMaxValue(static_cast(meanWeights.values), samplesCommon::volume(inputDims)); + + auto mean = network->addConstant(nvinfer1::Dims3(1, inputDims.d[1], inputDims.d[2]), meanWeights); + auto meanSub = network->addElementWise(*network->getInput(0), *mean->getOutput(0), ElementWiseOperation::kSUB); + network->getLayer(0)->setInput(0, *meanSub->getOutput(0)); + samplesCommon::setAllTensorScales(network.get(), maxMean / maxMean * 128, 128); +} + +//! +//! \brief Runs the TensorRT inference engine for this sample +//! +//! \details This function is the main execution function of the sample. It allocates +//! the buffer, sets inputs, executes the engine, and verifies the output. +//! +bool SampleReformatFreeIO::infer(SampleBuffer& inputBuf, SampleBuffer& outputBuf) +{ + const auto devInput = mallocCudaMem(inputBuf.getBufferSize()); + auto devOutput = mallocCudaMem(outputBuf.getBufferSize()); + + CHECK(cudaMemcpy(devInput.get(), inputBuf.buffer, inputBuf.getBufferSize(), cudaMemcpyHostToDevice)); + + auto context = SampleUniquePtr(mEngine->createExecutionContext()); + if (!context) + { + return false; + } + + // Create CUDA stream for the execution of this inference. + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + + void* bindings[2] = {devInput.get(), devOutput.get()}; + + // Asynchronously enqueue the inference work + if (!context->enqueue(1, bindings, stream, nullptr)) + { + return false; + } + + // Wait for the work in the stream to complete + cudaStreamSynchronize(stream); + + // Release stream + cudaStreamDestroy(stream); + + CHECK(cudaMemcpy(outputBuf.buffer, devOutput.get(), outputBuf.getBufferSize(), cudaMemcpyDeviceToHost)); + + return true; +} + +//! +//! \brief Used to clean up any state created in the sample class +//! +bool SampleReformatFreeIO::teardown() +{ + //! Clean up the libprotobuf files as the parsing is complete + //! \note It is not safe to use any other part of the protocol buffers library after + //! ShutdownProtobufLibrary() has been called. + nvcaffeparser1::shutdownProtobufLibrary(); + return true; +} + +//! +//! \brief Reads the digit map from file +//! 
+bool SampleReformatFreeIO::readDigits(SampleBuffer& buffer, int groundTruthDigit) +{ + const int inputH = buffer.dims.d[1]; + const int inputW = buffer.dims.d[2]; + + // Read a random digit file + std::vector fileData(inputH * inputW); + readPGMFile( + locateFile(std::to_string(groundTruthDigit) + ".pgm", mParams.dataDirs), fileData.data(), inputH, inputW); + + // Print ASCII representation of digit + gLogInfo << "Input:\n"; + for (int i = 0; i < inputH * inputW; i++) + { + gLogInfo << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % inputW) ? "" : "\n"); + } + gLogInfo << std::endl; + + float* inputBuf = reinterpret_cast(buffer.buffer); + + for (int i = 0; i < inputH * inputW; i++) + { + inputBuf[i] = float(fileData[i]); + } + + return true; +} + +//! +//! \brief Verifies that the output is correct and prints it +//! +template +bool SampleReformatFreeIO::verifyOutput(SampleBuffer& outputBuf, int groundTruthDigit) const +{ + const T* prob = reinterpret_cast(outputBuf.buffer); + + // Print histogram of the output distribution + gLogInfo << "Output:\n"; + float val{0.0f}; + float elem{0.0f}; + int idx{0}; + const int kDIGITS = 10; + + for (int i = 0; i < kDIGITS; i++) + { + elem = static_cast(prob[i]); + if (val < elem) + { + val = elem; + idx = i; + } + + gLogInfo << i << ": " << std::string(int(std::floor(elem * 10 + 0.5f)), '*') << "\n"; + } + gLogInfo << std::endl; + + return (idx == groundTruthDigit && val > 0.9f); +} + +int calcIndex(SampleBuffer& buffer, int c, int h, int w) +{ + int index; + + if (!buffer.desc.channelPivot) + { + index = c / buffer.desc.dims[3] * buffer.desc.dims[1] * buffer.desc.dims[2] * buffer.desc.dims[3] + + h * buffer.desc.dims[2] * buffer.desc.dims[3] + w * buffer.desc.dims[3] + c % buffer.desc.dims[3]; + } + else + { + index = h * buffer.desc.dims[2] * buffer.desc.dims[1] + w * buffer.desc.dims[2] + + c / buffer.desc.scalarPerVector * buffer.desc.scalarPerVector + c % buffer.desc.scalarPerVector; + } + + return index; +} + +//! +//! \brief Reformats the buffer. Src and dst buffers should be of same datatype and dims. +//! +template +void reformat(SampleBuffer& src, SampleBuffer& dst) +{ + if (src.format == dst.format) + { + memcpy(dst.buffer, src.buffer, src.getBufferSize()); + return; + } + + int srcIndex, dstIndex; + + T* srcBuf = reinterpret_cast(src.buffer); + T* dstBuf = reinterpret_cast(dst.buffer); + + for (int c = 0; c < src.dims.d[0]; c++) + { + for (int h = 0; h < src.dims.d[1]; h++) + { + for (int w = 0; w < src.dims.d[2]; w++) + { + srcIndex = calcIndex(src, c, h, w); + dstIndex = calcIndex(dst, c, h, w); + dstBuf[dstIndex] = srcBuf[srcIndex]; + } + } + } +} + +template +void convertGoldenData(SampleBuffer& goldenInput, SampleBuffer& dstInput) +{ + SampleBuffer tmpBuf(goldenInput.dims, sizeof(T), goldenInput.format); + + float* golden = reinterpret_cast(goldenInput.buffer); + T* tmp = reinterpret_cast(tmpBuf.buffer); + + for (int i = 0; i < goldenInput.desc.getElememtSize(); i++) + { + if (std::is_same::value) + { + tmp[i] = static_cast(golden[i] - 128); + } + else + { + tmp[i] = static_cast(golden[i]); + } + } + + reformat(tmpBuf, dstInput); +} + +//! +//! \brief Used to randomly initialize buffers +//! +void randomInitBuff(SampleBuffer& buffer) +{ + srand(time(NULL)); + + float* tmpBuf = reinterpret_cast(buffer.buffer); + + for (int i = 0; i < buffer.getBufferSize() / buffer.dataWidth; i++) + { + tmpBuf[i] = static_cast((rand() % 256) - 128); + } +} + +//! +//! \brief Initializes members of the params struct using the command line args +//! 
+samplesCommon::CaffeSampleParams initializeSampleParams(const samplesCommon::Args& args) +{ + samplesCommon::CaffeSampleParams params; + if (args.dataDirs.empty()) + { + params.dataDirs.push_back("data/mnist/"); + params.dataDirs.push_back("data/samples/mnist/"); + } + else + { + params.dataDirs = args.dataDirs; + } + + params.prototxtFileName = locateFile("mnist.prototxt", params.dataDirs); + params.weightsFileName = locateFile("mnist.caffemodel", params.dataDirs); + params.meanFileName = locateFile("mnist_mean.binaryproto", params.dataDirs); + params.inputTensorNames.push_back("data"); + params.batchSize = 1; + params.outputTensorNames.push_back("prob"); + params.dlaCore = args.useDLACore; + + return params; +} +//! +//! \brief Prints the help information for running this sample +//! +void printHelpInfo() +{ + std::cout << "Usage: ./sample_reformat_free_io [-h or --help] [-d or --datadir=] " + "[--useDLACore=]\n"; + std::cout << "--help Display help information\n"; + std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " + "multiple times to add multiple directories. If no data directories are given, the default is to use " + "data/samples/googlenet/ and data/googlenet/" + << std::endl; + std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " + "where n is the number of DLA engines on the platform." + << std::endl; +} + +//! +//! \brief Used to run the engine build and inference/reference functions +//! +template +int process(SampleReformatFreeIO& sample, const Logger::TestAtom& sampleTest, SampleBuffer& inputBuf, + SampleBuffer& outputBuf, SampleBuffer& goldenInput, SampleBuffer& goldenOutput) +{ + gLogInfo << "Building and running a GPU inference engine for reformat free I/O" << std::endl; + + inputBuf = SampleBuffer(sample.mInputDims, sizeof(T), sample.mTensorFormat); + outputBuf = SampleBuffer(sample.mOutputDims, sizeof(T), sample.mTensorFormat); + + if (!sample.build(sizeof(T))) + { + return gLogger.reportFail(sampleTest); + } + + convertGoldenData(goldenInput, inputBuf); + + if (!sample.infer(inputBuf, outputBuf)) + { + return gLogger.reportFail(sampleTest); + } + + SampleBuffer linearOutputBuf(sample.mOutputDims, sizeof(T), TensorFormat::kLINEAR); + + reformat(outputBuf, linearOutputBuf); + + if (!sample.verifyOutput(linearOutputBuf, sample.mDigit)) + { + return gLogger.reportFail(sampleTest); + } + + return 0; +} + +int runFP32Reference(SampleReformatFreeIO& sample, const Logger::TestAtom& sampleTest, SampleBuffer& goldenInput, + SampleBuffer& goldenOutput) +{ + gLogInfo << "Building and running a FP32 GPU inference to get golden input/output" << std::endl; + + if (!sample.build(sizeof(float))) + { + return gLogger.reportFail(sampleTest); + } + + goldenInput = SampleBuffer(sample.mInputDims, sizeof(float), TensorFormat::kLINEAR); + goldenOutput = SampleBuffer(sample.mOutputDims, sizeof(float), TensorFormat::kLINEAR); + + // randomInitBuff(goldenInput); + sample.readDigits(goldenInput, sample.mDigit); + + if (!sample.infer(goldenInput, goldenOutput)) + { + return gLogger.reportFail(sampleTest); + } + + if (!sample.verifyOutput(goldenOutput, sample.mDigit)) + { + return gLogger.reportFail(sampleTest); + } + + return 0; +} + +int main(int argc, char** argv) +{ + samplesCommon::Args args; + bool argsOK = samplesCommon::parseArgs(args, argc, argv); + if (!argsOK) + { + gLogError << "Invalid arguments" << std::endl; + printHelpInfo(); + return EXIT_FAILURE; + } + if 
(args.help) + { + printHelpInfo(); + return EXIT_SUCCESS; + } + + auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); + + gLogger.reportTestStart(sampleTest); + + samplesCommon::CaffeSampleParams params = initializeSampleParams(args); + + std::vector> vecFP16TensorFmt = { + std::make_pair(TensorFormat::kLINEAR, "kLINEAR"), + std::make_pair(TensorFormat::kCHW2, "kCHW2"), + std::make_pair(TensorFormat::kHWC8, "kHWC8"), + }; + std::vector> vecINT8TensorFmt = { + std::make_pair(TensorFormat::kLINEAR, "kLINEAR"), + std::make_pair(TensorFormat::kCHW4, "kCHW4"), + std::make_pair(TensorFormat::kCHW32, "kCHW32"), + }; + + SampleBuffer goldenInput, goldenOutput; + + SampleReformatFreeIO sample(params); + + srand(unsigned(time(nullptr))); + sample.mDigit = rand() % 10; + + gLogInfo << "The test chooses MNIST as the network and recognizes a randomly generated digit" << std::endl; + gLogInfo << "Firstly it runs the FP32 as the golden data, then INT8/FP16 with different formats will be tested" + << std::endl + << std::endl; + + runFP32Reference(sample, sampleTest, goldenInput, goldenOutput); + + // Test INT8 formats + for (auto elem : vecINT8TensorFmt) + { + gLogInfo << "Testing datatype INT8 with format " << elem.second << std::endl; + sample.mTensorFormat = elem.first; + SampleBuffer inputBuf, outputBuf; + + process(sample, sampleTest, inputBuf, outputBuf, goldenInput, goldenOutput); + } + + // Test FP16 formats + for (auto elem : vecFP16TensorFmt) + { + gLogInfo << "Testing datatype FP16 with format " << elem.second << std::endl; + sample.mTensorFormat = elem.first; + SampleBuffer inputBuf, outputBuf; + + process(sample, sampleTest, inputBuf, outputBuf, goldenInput, goldenOutput); + } + + if (!sample.teardown()) + { + return gLogger.reportFail(sampleTest); + } + + return gLogger.reportPass(sampleTest); +} diff --git a/samples/opensource/sampleSSD/README.md b/samples/opensource/sampleSSD/README.md index 0f2a5ee8..9bedc383 100644 --- a/samples/opensource/sampleSSD/README.md +++ b/samples/opensource/sampleSSD/README.md @@ -12,7 +12,7 @@ - [Prerequisites](#prerequisites) - [Running the sample](#running-the-sample) * [Sample `--help` options](#sample---help-options) -- [Additional resources](#additonal-resources) +- [Additional resources](#additional-resources) - [License](#license) - [Changelog](#changelog) - [Known issues](#known-issues) @@ -72,7 +72,7 @@ To initialize and register these TensorRT plugins to the plugin registry, the `i The sampleSSD sample builds a network based on a Caffe model and network description. For details on importing a Caffe model, see [Importing A Caffe Model Using The C++ Parser API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#import_caffe_c). The SSD network has few non-natively supported layers which are implemented as plugins in TensorRT. The Caffe parser can create plugins for these layers internally using the plugin registry. This sample can run in FP16 and INT8 modes based on the user input. For more details, see [INT8 Calibration Using C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#optimizing_int8_c) and [Enabling FP16 Inference Using C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#enable_fp16_c). The sample selects the entropy calibrator as a default choice. The `CalibrationMode` parameter in the sample code needs to be set to `0` to switch to the Legacy calibrator. 
- + For details on how to build the TensorRT engine, see [Building An Engine In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#build_engine_c). After the engine is built, the next steps are to serialize the engine and run the inference with the deserialized engine. For more information about these steps, see [Serializing A Model In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#serial_model_c). ### Verifying the output diff --git a/samples/opensource/sampleSSD/batchPrepare.py b/samples/opensource/sampleSSD/batchPrepare.py index 94b8343d..832f4cbb 100644 --- a/samples/opensource/sampleSSD/batchPrepare.py +++ b/samples/opensource/sampleSSD/batchPrepare.py @@ -80,9 +80,9 @@ batchfile = outDir + "/batch_calibration" + str(i) + ".batch" batchlistfile = outDir + "/batch_calibration" + str(i) + ".list" batchlist = open(batchlistfile,'a') - batch = np.zeros(shape=(NUM_PER_BATCH, 3, height, width), dtype=np.float32) + batch = np.zeros(shape=(NUM_PER_BATCH, 3, height, width), dtype = np.float32) for j in range(NUM_PER_BATCH): - batchlist.write(os.path.basename(imgs[img])+'\n') + batchlist.write(os.path.basename(imgs[img]) + '\n') im = Image.open(imgs[img]).resize((width,height), Image.NEAREST) in_ = np.array(im, dtype=np.float32, order='C') in_ = in_[:,:,::-1] diff --git a/samples/opensource/sampleSSD/sampleSSD.cpp b/samples/opensource/sampleSSD/sampleSSD.cpp index 36d8bde8..133e7aea 100644 --- a/samples/opensource/sampleSSD/sampleSSD.cpp +++ b/samples/opensource/sampleSSD/sampleSSD.cpp @@ -97,7 +97,8 @@ class SampleSSD //! \brief Parses a Caffe model for SSD and creates a TensorRT network //! bool constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser); + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser); //! //! \brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer @@ -134,13 +135,19 @@ bool SampleSSD::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvcaffeparser1::createCaffeParser()); if (!parser) { return false; } - auto constructed = constructNetwork(builder, network, parser); + auto constructed = constructNetwork(builder, network, config, parser); if (!constructed) { return false; @@ -162,7 +169,8 @@ bool SampleSSD::build() //! \param builder Pointer to the engine builder //! bool SampleSSD::constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser) + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser) { const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(mParams.prototxtFileName, mParams.dataDirs).c_str(), @@ -174,10 +182,29 @@ bool SampleSSD::constructNetwork(SampleUniquePtr& builder, } builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(36_MB); - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + config->setMaxWorkspaceSize(36_MiB); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore); + + // Calibrator life time needs to last until after the engine is built. 
+ std::unique_ptr calibrator; + + if (mParams.int8) + { + gLogInfo << "Using Entropy Calibrator 2" << std::endl; + BatchStream calibrationStream( + mParams.batchSize, mParams.nbCalBatches, mParams.calibrationBatches, mParams.dataDirs); + calibrator.reset( + new Int8EntropyCalibrator2(calibrationStream, 0, "SSD", mParams.inputTensorNames[0].c_str())); + config->setFlag(BuilderFlag::kINT8); + config->setInt8Calibrator(calibrator.get()); + } - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; @@ -351,6 +378,8 @@ SampleSSDParams initializeSampleParams(const samplesCommon::Args& args) { params.dataDirs.push_back("data/ssd/"); params.dataDirs.push_back("data/samples/ssd/"); + params.dataDirs.push_back("data/int8_samples/ssd/"); + params.dataDirs.push_back("int8/ssd/"); } else //!< Use the data directory provided by the user { @@ -363,6 +392,8 @@ SampleSSDParams initializeSampleParams(const samplesCommon::Args& args) params.outputTensorNames.push_back("detection_out"); params.outputTensorNames.push_back("keep_count"); params.dlaCore = args.useDLACore; + params.int8 = args.runInInt8; + params.fp16 = args.runInFp16; params.outputClsSize = 21; params.keepTopK = 200; // Number of total bboxes to be kept per image after NMS step. It is same as @@ -389,6 +420,8 @@ void printHelpInfo() std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " "where n is the number of DLA engines on the platform." << std::endl; + std::cout << "--fp16 Specify to run in fp16 mode." << std::endl; + std::cout << "--int8 Specify to run in int8 mode." << std::endl; } int main(int argc, char** argv) diff --git a/samples/opensource/sampleUffFasterRCNN/CMakeLists.txt b/samples/opensource/sampleUffFasterRCNN/CMakeLists.txt new file mode 100644 index 00000000..897b9947 --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +SET(SAMPLE_SOURCES + sampleUffFasterRCNN.cpp +) + +set(SAMPLE_PARSERS "uff") +set(PLUGINS_NEEDED ON) + +include(../../CMakeSamplesTemplate.txt)
diff --git a/samples/opensource/sampleUffFasterRCNN/README.md b/samples/opensource/sampleUffFasterRCNN/README.md new file mode 100644 index 00000000..6debb275 --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/README.md @@ -0,0 +1,234 @@ +# Object Detection With A TensorFlow FasterRCNN Network + +**Table Of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) + * [Processing the input graph](#processing-the-input-graph) + * [Preparing the data](#preparing-the-data) + * [sampleUffFasterRCNN plugins](#sampleufffasterrcnn-plugins) + * [Verifying the output](#verifying-the-output) + * [TensorRT API layers and ops](#tensorrt-api-layers-and-ops) +- [Prerequisites](#prerequisites) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, sampleUffFasterRCNN, is a UFF TensorRT sample for Faster-RCNN in the [NVIDIA Transfer Learning Toolkit SDK](https://developer.nvidia.com/transfer-learning-toolkit). This sample serves as a demo of how to use a pretrained Faster-RCNN model from the Transfer Learning Toolkit to do inference with TensorRT. Besides the sample itself, it also provides two TensorRT plugins, `Proposal` and `CropAndResize`, to implement the proposal layer and the ROIPooling layer as custom layers in the model, since TensorRT has no native support for them. + +## How does this sample work? + +The UFF Faster R-CNN network performs the task of object detection and localization in a single forward pass of the network. The UFF Faster R-CNN network was trained on the ResNet-18 backbone (feature extractor) to detect 4 classes of objects: `Automobile`, `Roadsign`, `Bicycle` and `Person`, along with the `background` class (no object). + +This sample makes use of TensorRT plugins to run the UFF Faster R-CNN network. To use these plugins, the TensorFlow graph needs to be preprocessed, and we use the GraphSurgeon utility to do this. + +The main components of this network are the Image Preprocessor, FeatureExtractor, Region Proposal Network (RPN), Proposal, ROIPooling (CropAndResize), Classifier and Postprocessor. + +In this sample, we provide a UFF model as a demo. In the Transfer Learning Toolkit workflow, however, the UFF model is not available; we can only get the `.tlt` model during training and the `.etlt` model after `tlt-export`. Both of them are encrypted models, and the Transfer Learning Toolkit user will use `tlt-converter` to decrypt the `.etlt` model and generate a TensorRT engine file in a single step. Therefore, in the Transfer Learning Toolkit workflow, we will consume the TensorRT engine instead of a UFF model. However, this sample can still serve as a demo on how to use the UFF Faster R-CNN model regardless of its format. + +**Image Preprocessor** +The image preprocessor step of the graph is responsible for resizing the image. The image is resized to a 3x272x480 (CHW) tensor. This step also performs per-channel mean value subtraction of the images. After preprocessing, the input images' channel order is `BGR` instead of `RGB`. + +**FeatureExtractor** +The FeatureExtractor portion of the graph runs the ResNet18 network on the preprocessed image.
The feature maps generated are used by the RPN layer and the Proposal layer to generate the Regions of Interest (ROIs) that may contain objects. As a second branch, the feature maps are also used in the ROIPooling (or, more precisely, CropAndResize) layer to crop out the patches from the feature maps with the specified ROIs output from the Proposal layer. + +In this network, the feature maps come from an intermediate layer's output in the ResNet-18 backbone. The intermediate layer has a cumulative stride of 16. + +**Region Proposal Network (RPN)** +The RPN takes the feature maps from the stride-16 backbone and appends a small Convolutional Neural Network (CNN) head after it to detect whether a specific region of the image contains an object or not. It also outputs rough coordinates of the candidate object. + +**Proposal** +The Proposal layer takes its input from the RPN and refines the candidate boxes produced by the RPN. The refinement consists of taking the top boxes with the highest confidence, applying NMS (non-maximum suppression) to them, and finally taking the top boxes again according to their confidence after the NMS operation. + +This operation is implemented in the `Proposal` plugin as a TensorRT plugin. + +**CropAndResize** +The CropAndResize layer is a TensorFlow-style implementation of the original ROIPooling layer from the Caffe implementation. The CropAndResize layer resizes the ROIs from the Proposal layer to a common target size, and the output results are followed by a classifier to distinguish which class the ROI belongs to. The difference between the CropAndResize operation and the ROIPooling operation is that the former uses bilinear interpolation while the latter uses pooling. + +This operation is implemented in the `CropAndResize` plugin as a TensorRT plugin. + +**Classifier** +The classifier is a small network that takes the output of the CropAndResize layer as input and distinguishes which class the ROI belongs to. It also outputs delta coordinates to refine the coordinates output from the RPN layer. + +**Postprocessor** +The Postprocessor applies the delta values from the classifier output to the coordinates from the RPN output and then performs NMS to get the final detection results. + +Specifically, this sample performs the following steps: +- [Processing the input graph](#processing-the-input-graph) +- [Preparing the data](#preparing-the-data) +- [sampleUffFasterRCNN plugins](#sampleufffasterrcnn-plugins) +- [Verifying the output](#verifying-the-output) + + +### Processing the input graph + +The TensorFlow FasterRCNN graph has some operations that are currently not supported in TensorRT. Using a preprocessor on the graph, we can combine multiple operations in the graph into a single custom operation which can be implemented as a plugin layer in TensorRT. Currently, the preprocessor provides the ability to stitch all nodes within a namespace into one custom node. + +To use the preprocessor, the `convert-to-uff` utility should be called with a `-p` flag and a config file. The config script should also include attributes for all custom plugins which will be embedded in the generated `.uff` file. The current sample script for UFF Faster R-CNN is `config.py` in this sample. + +### Preparing the data + +The generated network has an input node called `input_1`, and the output nodes' names are `dense_class/Softmax`, `dense_regress/BiasAdd` and `proposal`. These nodes are registered by the UFF Parser in the sample.
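For illustration, the following is a minimal sketch (not the sample's exact code) of how such nodes could be registered with the UFF parser before parsing; the input dimensions follow the 3x272x480 (CHW) input described above, and the surrounding builder/network setup is assumed to already exist.

```cpp
// Sketch only: register the Faster R-CNN input/output nodes with the UFF parser.
// The node names and dimensions are the ones listed above; everything else is assumed context.
auto parser = nvuffparser::createUffParser();
parser->registerInput("input_1", nvinfer1::Dims3(3, 272, 480), nvuffparser::UffInputOrder::kNCHW);
parser->registerOutput("dense_class/Softmax");
parser->registerOutput("dense_regress/BiasAdd");
parser->registerOutput("proposal");
// parser->parse(uffFileName, network, nvinfer1::DataType::kFLOAT) would then populate the network.
```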
+
+The input to the UFF Faster R-CNN network in this sample is a 3-channel 480x272 image. In the sample, we subtract the per-channel mean values from the input images.
+
+Since TensorRT does not depend on any computer vision libraries, the images are represented in binary R, G, and B values for each pixel. The format is Portable PixMap (PPM), which is a netpbm color image format. In this format, the R, G, and B values for each pixel are represented by an 8-bit integer (0-255) and they are stored together, pixel by pixel. The channel order of the input image is actually BGR instead of RGB, due to the implementation of the model.
+
+The sample provides a simple PPM reading function called `readPPMFile`.
+
+### sampleUffFasterRCNN plugins
+
+Details about how to create TensorRT plugins can be found in [Extending TensorRT With Custom Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#extending).
+
+The `config.py` defined for the `convert-to-uff` command should have the custom layers mapped to the plugin names in TensorRT by modifying the op field. The names of the plugin parameters should also exactly match those expected by the TensorRT plugins.
+
+If `config.py` is defined as above, the NvUffParser will be able to parse the network and call the appropriate plugins with the correct parameters.
+
+Details about some of the plugin layers implemented for UFF Faster R-CNN in TensorRT are given below.
+
+**CropAndResize plugin**
+The `CropAndResize` plugin crops patches out of the feature maps according to the ROI coordinates from the Proposal layer and resizes them to a common target size, for example, 7x7. The output tensor is used as the input of the classifier that follows the `CropAndResize` plugin.
+
+**Proposal plugin**
+The `Proposal` plugin refines the candidate boxes from the RPN. The refinement includes selecting the top boxes according to their confidence, applying NMS, and finally selecting the top boxes that have the highest confidence after NMS.
+
+### Verifying the output
+
+After the builder is created (see [Building An Engine In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#build_engine_c)) and the engine is serialized (see [Serializing A Model In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#serial_model_c)), we can perform inference. Steps for deserialization and running inference are outlined in [Performing Inference In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#perform_inference_c). The outputs of the UFF FasterRCNN network are human interpretable. The results are visualized by drawing bounding boxes on the images.
+
+### TensorRT API layers and ops
+
+In this sample, the following layers are used. For more information about these layers, see the [TensorRT Developer Guide: Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#layers) documentation.
+
+[Activation layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#activation-layer)
+The Activation layer implements element-wise activation functions. Specifically, this sample uses the Activation layer with the type `kRELU`.
+
+[Convolution layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#convolution-layer)
+The Convolution layer computes a 2D (channel, height, and width) convolution, with or without bias.
+
+[FullyConnected layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#fullyconnected-layer)
+The FullyConnected layer implements a matrix-vector product, with or without bias.
+
+[Padding layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#padding-layer)
+The IPaddingLayer implements spatial zero-padding of tensors along the two innermost dimensions.
+
+[Plugin layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#plugin-layer)
+Plugin layers are user-defined and provide the ability to extend the functionalities of TensorRT. See [Extending TensorRT With Custom Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#extending) for more details.
+
+[Pooling layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#pooling-layer)
+The Pooling layer implements pooling within a channel. Supported pooling types are `maximum`, `average` and `maximum-average blend`.
+
+[Scale layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#scale-layer)
+The Scale layer implements a per-tensor, per-channel, or per-element affine transformation and/or exponentiation by constant values.
+
+[SoftMax layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#softmax-layer)
+The SoftMax layer applies the SoftMax function on the input tensor along an input dimension specified by the user.
+
+## Prerequisites
+1. Install the UFF toolkit and graph surgeon. Depending on your TensorRT installation method, choose the corresponding instructions to install the toolkit and graph surgeon (see [TensorRT Installation Guide: Installing TensorRT](https://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing)).
+
+2. We provide a bash script that downloads the model as well as other data required for this sample: `./download_model.sh`.
+
+    The model is downloaded and unzipped into the directory `uff_faster_rcnn`, and the `pb` model is `uff_faster_rcnn/faster_rcnn.pb`.
+
+    Along with the `pb` model, there are some PPM images and a `list.txt` in the directory. The PPM images are the test images used in this sample. The `list.txt` file is used in INT8 mode to list the image names used in the INT8 calibration step in TensorRT.
+
+3. Perform preprocessing on the TensorFlow model using the UFF converter.
+    1. Copy the TensorFlow protobuf file (`faster_rcnn.pb`) from the downloaded directory in the previous step to the working directory (for example `/usr/src/tensorrt/data/faster-rcnn-uff`).
+
+    2. Patch the UFF converter.
+
+        Apply a patch to the UFF converter to fix an issue with the Softmax layer in the UFF package. Let `UFF_ROOT` denote the root directory of the Python UFF package, for example, `/usr/lib/python2.7/dist-packages/uff`.
+
+        Then, apply the patch with the following command:
+        `patch UFF_ROOT/converters/tensorflow/converter_functions.py < fix_softmax.patch`
+
+        The patch file `fix_softmax.patch` was generated against the UFF package version 0.6.3 in TensorRT 5.1 GA. Ensure your UFF package version is also 0.6.3 before applying the patch. For TensorRT 6.0, this step can be skipped because the issue is already fixed there.
+
+    3. Run the following command for the conversion.
+ ``` + convert-to-uff -p config.py -O dense_class/Softmax -O dense_regress/BiasAdd -O proposal faster_rcnn.pb + ``` + This saves the converted `.uff` file in the same directory as the input with the name `faster_rcnn.uff`. + + The `config.py` script specifies the preprocessing operations necessary for the UFF Faster R-CNN TensorFlow graph. The plugin nodes and plugin parameters used in the `config.py` script should match the registered plugins in TensorRT. + +4. The sample also requires a `list.txt` file with a list of all the calibration images (basename, without suffix) when running in INT8 mode. Copy the `list.txt` to the same directory that contains the `pb` model. + +5. Copy the PPM images in the data directory the same directory that contains the `pb` model. + + +## Running the sample + +1. Following the [top level guide](../../../README.md) to build the OSS samples(including this sample, of course). The binary named `sample_uff_fasterRCNN` will be created in the `build/cmake/out` directory. + +2. Run the sample to perform object detection and localization. + + To run the sample in FP32 mode: + ``` + ./sample_uff_fasterRCNN --datadir /data/uff_faster_rcnn -W 480 -H 272 -I 2016_1111_185016_003_00001_night_000441.ppm + ``` + + To run the sample in INT8 mode: + ``` + ./sample_uff_fasterRCNN --datadir /data/uff_faster_rcnn -i -W 480 -H 272 -I 2016_1111_185016_003_00001_night_000441.ppm + ``` + +3. Verify that the sample ran successfully. If the sample runs successfully you should see output similar to the following: + ``` + Detected Automobile in 2016_1111_185016_003_00001_night_000441.ppm with confidence 99.9734% + Detected Automobile in 2016_1111_185016_003_00001_night_000441.ppm with confidence 99.9259% + Detected Automobile in 2016_1111_185016_003_00001_night_000441.ppm with confidence 98.7359% + Detected Automobile in 2016_1111_185016_003_00001_night_000441.ppm with confidence 92.4371% + Detected Automobile in 2016_1111_185016_003_00001_night_000441.ppm with confidence 89.7888% + ``` + This output shows that the sample ran successfully; `PASSED`. + + +### Sample `--help` options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. +``` +Usage: ./sample_uff_fasterRCNN --datadir /data/uff_faster_rcnn -h +--help[-h] Display help information +--datadir[-d] Specify path to a data directory, overriding the default. This option can be repeated to add multiple directories. If the option is unspecified, the default is to search data/faster-rcnn/ and data/samples/faster-rcnn/. +--useDLACore[-u] Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform. +--fp16[-f] Specify to run in fp16 mode. +--int8[-i] Specify to run in int8 mode. +--inputWidth[-W] Specify the input width of the model. +--inputHeight[-H] Specify the input height of the model. +--batchSize[-B] Specify the batch size for inference. +--profile[-p] Whether to do per-layer profiling. +--repeat[-r] Specify the repeat number to execute the TRT context, used to smooth the profiling time. +--inputImages[-I] Specify the input images for inference. +--saveEngine[-s] Path to save engine. +--loadEngine[-l] Path to load engine. +``` + +# Additional resources + +The following resources provide a deeper understanding about sampleUffFasterRCNN. 
+ +**Documentation** +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +# License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + + +# Changelog + +July 2019 +This is the first release of the `README.md` file and sample. + + +# Known issues + +There are no known issues in this sample. diff --git a/samples/opensource/sampleUffFasterRCNN/config.py b/samples/opensource/sampleUffFasterRCNN/config.py new file mode 100644 index 00000000..b4471222 --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/config.py @@ -0,0 +1,36 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import tensorflow as tf +import graphsurgeon as gs + + +CropAndResize = gs.create_plugin_node(name='roi_pooling_conv_1/CropAndResize_new', op="CropAndResize", inputs=['activation_7/Relu', 'proposal'], crop_height=7, crop_width=7) +Proposal = gs.create_plugin_node(name='proposal', op='Proposal', inputs=['rpn_out_class/Sigmoid', 'rpn_out_regress/BiasAdd'], input_height=272, input_width=480, rpn_stride=16, roi_min_size=1.0, nms_iou_threshold=0.7, pre_nms_top_n=6000, post_nms_top_n=300, anchor_sizes=[32.0, 64.0, 128.0], anchor_ratios=[1.0, 0.5, 2.0]) + + +namespace_plugin_map = { +"crop_and_resize_1/Reshape" : CropAndResize, +'crop_and_resize_1/CropAndResize' : CropAndResize, +"crop_and_resize_1/transpose" : CropAndResize, +"crop_and_resize_1/transpose_1" : CropAndResize +} + + +def preprocess(dynamic_graph): + # Now create a new graph by collapsing namespaces + dynamic_graph.append(Proposal) + dynamic_graph.remove('input_2') + dynamic_graph.collapse_namespaces(namespace_plugin_map) + diff --git a/samples/opensource/sampleUffFasterRCNN/download_model.sh b/samples/opensource/sampleUffFasterRCNN/download_model.sh new file mode 100755 index 00000000..331a5313 --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/download_model.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -eo pipefail +# check for wget +which wget || { echo 'wget not found, please install.' && exit 1; } +# download +mkdir -p uff_faster_rcnn && \ +cd uff_faster_rcnn && \ +wget 'https://raw.githubusercontent.com/NVIDIA-AI-IOT/deepstream_4.x_apps/master/models/frcnn/faster_rcnn.pb' && \ +wget 'https://raw.githubusercontent.com/NVIDIA-AI-IOT/deepstream_4.x_apps/master/models/frcnn/2015_0502_034830_005_00001_rain_000179.ppm' && \ +wget 'https://raw.githubusercontent.com/NVIDIA-AI-IOT/deepstream_4.x_apps/master/models/frcnn/2016_1111_185016_003_00001_night_000441.ppm' && \ +wget 'https://raw.githubusercontent.com/NVIDIA-AI-IOT/deepstream_4.x_apps/master/models/frcnn/57ea04a57823530017bf15bf_000000.ppm' && \ +wget 'https://raw.githubusercontent.com/NVIDIA-AI-IOT/deepstream_4.x_apps/master/models/frcnn/57ea04a57823530017bf15bf_001008.ppm' && \ +ls *.ppm | cut -d. -f1 >> list.txt && \ +echo 'Model downloading finished !' && \ +cd .. diff --git a/samples/opensource/sampleUffFasterRCNN/fix_softmax.patch b/samples/opensource/sampleUffFasterRCNN/fix_softmax.patch new file mode 100644 index 00000000..ce99b5ae --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/fix_softmax.patch @@ -0,0 +1,11 @@ +--- converter_functions.py 2019-07-30 14:02:10.215925898 +0800 ++++ converter_functions_fix_softmax.py 2019-07-30 13:53:46.187910972 +0800 +@@ -231,7 +231,7 @@ + else: + axis = 0 + fmt = convert_to_str(tf_node.attr['data_format'].s) +- fmt = fmt if fmt else "NHWC" ++ fmt = fmt if fmt else "NCHW" + data_fmt = tf2uff.convert_tf2uff_data_format(fmt) + uff_graph.softmax(inputs[0], axis, data_fmt, name) + return [tf2uff.split_node_name_and_output(inp)[0] for inp in inputs] diff --git a/samples/opensource/sampleUffFasterRCNN/frcnnUtils.h b/samples/opensource/sampleUffFasterRCNN/frcnnUtils.h new file mode 100644 index 00000000..8db87017 --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/frcnnUtils.h @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FRCNN_UTILS_H +#define FRCNN_UTILS_H + +#include "NvInfer.h" +#include "argsParser.h" +#include "common.h" +#include +#include +#include +#include + +using namespace samplesCommon; + +//! \brief Split a string at a delimiter(defaults to comma). +//! +void splitStr(const char* s, std::vector& ret, char del = ',') +{ + int idx = 0; + while (std::string::npos != std::string(s + idx).find(std::string(1, del))) + { + auto s_tmp = std::string(s + idx).substr(0, std::string(s + idx).find(std::string(1, del))); + ret.push_back(s_tmp); + idx += std::string(s + idx).find(std::string(1, del)) + 1; + } + if (s[idx] != 0) + { + ret.push_back(std::string(s + idx)); + } +} + +//! \class +//! +//! \brief The command line arguments for this sample. +//! 
+struct FrcnnArgs +{ + bool runInInt8{false}; + bool runInFp16{false}; + bool help{false}; + int useDLACore{-1}; + std::vector dataDirs; + int inputHeight; + int inputWidth; + int repeat{1}; + bool profile{false}; + int batchSize{1}; + std::vector inputImages; + std::string saveEngine{""}; + std::string loadEngine{""}; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise program should exit +//! +bool parseFrcnnArgs(FrcnnArgs& args, int argc, char* argv[]) +{ + while (1) + { + int arg; + static struct option long_options[] = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, + {"int8", no_argument, 0, 'i'}, {"fp16", no_argument, 0, 'f'}, {"useDLACore", required_argument, 0, 'u'}, + {"inputHeight", required_argument, 0, 'H'}, {"inputWidth", required_argument, 0, 'w'}, + {"repeat", required_argument, 0, 'r'}, {"profile", no_argument, 0, 'p'}, + {"batchSize", required_argument, 0, 'b'}, {"inputImages", required_argument, 0, 'I'}, + {"saveEngine", required_argument, 0, 's'}, {"loadEngine", required_argument, 0, 'l'}, + {nullptr, 0, nullptr, 0}}; + int option_index = 0; + arg = getopt_long(argc, argv, "hd:ifuH:W:r:pB:I:s:l:", long_options, &option_index); + + if (arg == -1) + { + break; + } + + switch (arg) + { + case 'h': args.help = true; return true; + + case 'd': + if (optarg) + { + args.dataDirs.push_back(optarg); + } + else + { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + + break; + + case 'i': args.runInInt8 = true; break; + + case 'f': args.runInFp16 = true; break; + + case 'u': + if (optarg) + { + args.useDLACore = std::stoi(optarg); + } + + break; + + case 'H': args.inputHeight = std::stoi(optarg); break; + + case 'W': args.inputWidth = std::stoi(optarg); break; + + case 'r': args.repeat = std::stoi(optarg); break; + + case 'p': args.profile = true; break; + + case 'B': args.batchSize = std::stoi(optarg); break; + + case 'I': splitStr(optarg, args.inputImages); break; + case 's': args.saveEngine = optarg; break; + case 'l': args.loadEngine = optarg; break; + default: return false; + } + } + + return true; +} + +//! \brief resize PPM on-the-fly so that user can specify input dimensions as commandline args. +//! +void resizePPM(vPPM& ppm, int target_width, int target_height) +{ + auto clip = [](float in, float low, float high) -> float { return (in < low) ? low : (in > high ? 
high : in); }; + int original_height = ppm.h; + int original_width = ppm.w; + ppm.h = target_height; + ppm.w = target_width; + float ratio_h = static_cast(original_height - 1.0f) / static_cast(target_height - 1.0f); + float ratio_w = static_cast(original_width - 1.0f) / static_cast(target_width - 1.0f); + std::vector tmp_buf; + + for (int y = 0; y < target_height; ++y) + { + for (int x = 0; x < target_width; ++x) + { + float x0 = static_cast(x) * ratio_w; + float y0 = static_cast(y) * ratio_h; + int left = static_cast(clip(std::floor(x0), 0.0f, static_cast(original_width - 1.0f))); + int top = static_cast(clip(std::floor(y0), 0.0f, static_cast(original_height - 1.0f))); + int right = static_cast(clip(std::ceil(x0), 0.0f, static_cast(original_width - 1.0f))); + int bottom = static_cast(clip(std::ceil(y0), 0.0f, static_cast(original_height - 1.0f))); + + for (int c = 0; c < 3; ++c) + { + // H, W, C ordering + uint8_t left_top_val = ppm.buffer[top * (original_width * 3) + left * (3) + c]; + uint8_t right_top_val = ppm.buffer[top * (original_width * 3) + right * (3) + c]; + uint8_t left_bottom_val = ppm.buffer[bottom * (original_width * 3) + left * (3) + c]; + uint8_t right_bottom_val = ppm.buffer[bottom * (original_width * 3) + right * (3) + c]; + float top_lerp = left_top_val + (right_top_val - left_top_val) * (x0 - left); + float bottom_lerp = left_bottom_val + (right_bottom_val - left_bottom_val) * (x0 - left); + float lerp = clip(std::round(top_lerp + (bottom_lerp - top_lerp) * (y0 - top)), 0.0f, 255.0f); + tmp_buf.push_back(static_cast(lerp)); + } + } + } + + ppm.buffer = tmp_buf; +} + +//! \class BatchStream +//! +//! \brief Custom BatchStream class for Faster-RCNN because we use variable input dimensions and different image +//! preprocessing. +//! +class BatchStream +{ +public: + BatchStream( + int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mPrefix(prefix) + , mSuffix(suffix) + , mDataDir(directories) + { + FILE* file = fopen(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), "rb"); + assert(file != nullptr); + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + assert(readSize == 4); + mDims.nbDims = 4; // The number of dimensions. 
+ mDims.d[0] = d[0]; // Batch Size + mDims.d[1] = d[1]; // Channels + mDims.d[2] = d[2]; // Height + mDims.d[3] = d[3]; // Width + assert(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); + fclose(file); + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + BatchStream(int batchSize, int maxBatches, std::string prefix, std::vector directories) + : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) + { + } + + BatchStream( + int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mDims(dims) + , mListFile(listFile) + , mDataDir(directories) + { + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + // Resets data members + void reset(int firstBatch) + { + mBatchCount = 0; + mFileCount = 0; + mFileBatchPos = mDims.d[0]; + skip(firstBatch); + } + + // Advance to next batch and return true, or return false if there is no batch left. + bool next() + { + if (mBatchCount == mMaxBatches) + { + return false; + } + + for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + { + assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); + + if (mFileBatchPos == mDims.d[0] && !update()) + { + return false; + } + + // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. + csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + std::copy_n( + getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); + std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); + } + + mBatchCount++; + return true; + } + + // Skips the batches + void skip(int skipCount) + { + if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) + { + mFileCount += skipCount * mBatchSize / mDims.d[0]; + return; + } + + int x = mBatchCount; + + for (int i = 0; i < skipCount; i++) + { + next(); + } + + mBatchCount = x; + } + + float* getBatch() + { + return mBatch.data(); + } + + float* getLabels() + { + return mLabels.data(); + } + + int getBatchesRead() const + { + return mBatchCount; + } + + int getBatchSize() const + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const + { + return mDims; + } + +private: + float* getFileBatch() + { + return mFileBatch.data(); + } + + float* getFileLabels() + { + return mFileLabels.data(); + } + + bool update() + { + if (mListFile.empty()) + { + std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); + FILE* file = fopen(inputFileName.c_str(), "rb"); + + if (!file) + { + return false; + } + + int d[4]; + size_t readSize = fread(d, sizeof(int), 4, file); + assert(readSize == 4); + assert(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); + size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file); + assert(readInputCount == size_t(mDims.d[0] * mImageSize)); + size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file); + assert(readLabelCount == 0 || readLabelCount == 
size_t(mDims.d[0])); + fclose(file); + } + else + { + std::vector fNames; + std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); + + if (!file) + { + return false; + } + + gLogInfo << "Batch #" << mFileCount << std::endl; + file.seekg(((mBatchCount * mBatchSize)) * 7); + + for (int i = 1; i <= mBatchSize; i++) + { + std::string sName; + std::getline(file, sName); + sName = sName + ".ppm"; + gLogInfo << "Calibrating with file " << sName << std::endl; + fNames.emplace_back(sName); + } + + mFileCount++; + std::vector ppms(fNames.size()); + + for (uint32_t i = 0; i < fNames.size(); ++i) + { + readPPMFile(fNames[i], ppms[i], mDataDir); + } + + std::vector data(samplesCommon::volume(mDims)); + // Normalize input data + float pixelMean[3]{103.939, 116.779, 123.68}; + + for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i) + { + for (int c = 0; c < mDims.d[1]; ++c) + { + for (unsigned j = 0, volChl = mDims.d[2] * mDims.d[3]; j < volChl; ++j) + { + data[i * volImg + c * volChl + j] + = float(ppms[i].buffer[j * mDims.d[1] + 2 - c]) - pixelMean[c]; + } + } + } + + std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch()); + } + + mFileBatchPos = 0; + return true; + } + + int mBatchSize{0}; + int mMaxBatches{0}; + int mBatchCount{0}; + int mFileCount{0}; + int mFileBatchPos{0}; + int mImageSize{0}; + std::vector mBatch; //!< Data for the batch + std::vector mLabels; //!< Labels for the batch + std::vector mFileBatch; //!< List of image files + std::vector mFileLabels; //!< List of label files + std::string mPrefix; //!< Batch file name prefix + std::string mSuffix; //!< Batch file name suffix + nvinfer1::Dims mDims; //!< Input dimensions + std::string mListFile; //!< File name of the list of image names + std::vector mDataDir; //!< Directories where the files can be found +}; + +//! \class EntropyCalibratorImpl +//! +//! \brief Implements common functionality for Entropy calibrators. +//! +class EntropyCalibratorImpl +{ +public: + EntropyCalibratorImpl( + BatchStream& stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true) + : mStream(stream) + , mCalibrationTableName("CalibrationTable" + networkName) + , mInputBlobName(inputBlobName) + , mReadCache(readCache) + { + nvinfer1::Dims dims = mStream.getDims(); + mInputCount = samplesCommon::volume(dims); + CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); + mStream.reset(firstBatch); + } + + virtual ~EntropyCalibratorImpl() + { + CHECK(cudaFree(mDeviceInput)); + } + + int getBatchSize() const + { + return mStream.getBatchSize(); + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) + { + if (!mStream.next()) + { + return false; + } + + CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); + assert(!strcmp(names[0], mInputBlobName)); + bindings[0] = mDeviceInput; + return true; + } + + const void* readCalibrationCache(size_t& length) + { + mCalibrationCache.clear(); + std::ifstream input(mCalibrationTableName, std::ios::binary); + input >> std::noskipws; + + if (mReadCache && input.good()) + { + std::copy(std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(mCalibrationCache)); + } + + length = mCalibrationCache.size(); + return length ? 
mCalibrationCache.data() : nullptr; + } + + void writeCalibrationCache(const void* cache, size_t length) + { + std::ofstream output(mCalibrationTableName, std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + +private: + BatchStream mStream; + size_t mInputCount; + std::string mCalibrationTableName; + const char* mInputBlobName; + bool mReadCache{true}; + void* mDeviceInput{nullptr}; + std::vector mCalibrationCache; +}; + +//! \class Int8EntropyCalibrator2 +//! +//! \brief Implements Entropy calibrator 2. +//! CalibrationAlgoType is kENTROPY_CALIBRATION_2. +//! +class Int8EntropyCalibrator2 : public IInt8EntropyCalibrator2 +{ +public: + Int8EntropyCalibrator2( + BatchStream& stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true) + : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) + { + } + + int getBatchSize() const override + { + return mImpl.getBatchSize(); + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) override + { + return mImpl.getBatch(bindings, names, nbBindings); + } + + const void* readCalibrationCache(size_t& length) override + { + return mImpl.readCalibrationCache(length); + } + + void writeCalibrationCache(const void* cache, size_t length) override + { + mImpl.writeCalibrationCache(cache, length); + } + +private: + EntropyCalibratorImpl mImpl; +}; + +#endif diff --git a/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp b/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp new file mode 100644 index 00000000..d4718ca7 --- /dev/null +++ b/samples/opensource/sampleUffFasterRCNN/sampleUffFasterRCNN.cpp @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! +//! sampleFasterRCNN_uff.cpp +//! This file contains the implementation of the Uff FasterRCNN sample. It creates the network using +//! the FasterRCNN UFF model. +//! It can be run with the following command line: +//! Command: ./sample_uff_fasterRCNN [-h] +//! + +#include "NvInferPlugin.h" +#include "NvUffParser.h" +#include "argsParser.h" +#include "buffers.h" +#include "common.h" +#include "frcnnUtils.h" +#include "logger.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace samplesCommon; + +//! \brief Define the PPM objects as global variable. +//! +std::vector ppms; + +//! \brief The name of this sample. +//! +const std::string gSampleName = "TensorRT.sample_uff_fasterRCNN"; + +//! \class +//! +//! \brief Define the parameters for this sample. +//! 
+struct SampleUffFasterRcnnParams : public samplesCommon::SampleParams +{ + std::string uffFileName; //!< The file name of the UFF model to use + std::string inputNodeName; + std::string outputClsName; + std::string outputRegName; + std::string outputProposalName; + + std::vector inputImages; + int inputChannels; + int inputHeight; + int inputWidth; + int outputClassSize; + int outputBboxSize; + float nmsIouThresholdClassifier; + float visualizeThreshold; + std::vector classifierRegressorStd; + std::vector classNames; + + int postNmsTopN; + int calBatchSize; + int nbCalBatches; + + int repeat; + bool profile; + + std::string saveEngine; + std::string loadEngine; +}; + +//! \class +//! +//! \brief The class that defines the overall workflow of this sample. +//! +class SampleUffFasterRcnn +{ + template + using SampleUniquePtr = std::unique_ptr; + +public: + SampleUffFasterRcnn(const SampleUffFasterRcnnParams& params) + : mParams(params) + , mEngine(nullptr) + { + } + + //! + //! \brief Function builds the network engine + //! + bool build(); + + //! + //! \brief Runs the TensorRT inference engine for this sample + //! + bool infer(); + + //! + //! \brief Cleans up any state created in the sample class + //! + bool teardown(); + +private: + SampleUffFasterRcnnParams mParams; //!< The parameters for the sample. + + nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network. + + std::shared_ptr mEngine; //!< The TensorRT engine used to run the network + + //! + //! \brief Parses an UFF model for SSD and creates a TensorRT network + //! + bool constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& parser); + + //! + //! \brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer + //! + bool processInput(const samplesCommon::BufferManager& buffers); + + //! + //! \brief Filters output detections and verify results + //! + bool verifyOutput(const samplesCommon::BufferManager& buffers); + + //! + //! \brief Helper function to do post-processing(apply delta to ROIs). + //! + void batch_inverse_transform_classifier(const float* roi_after_nms, int roi_num_per_img, + const float* classifier_cls, const float* classifier_regr, std::vector& pred_boxes, + std::vector& pred_cls_ids, std::vector& pred_probs, std::vector& box_num_per_img, int N); + + //! + //! \brief NMS helper function in post-processing. + //! + std::vector nms_classifier(std::vector& boxes_per_cls, std::vector& probs_per_cls, + float NMS_OVERLAP_THRESHOLD, int NMS_MAX_BOXES); + + //! + //! \brief Helper function to dump bbox-overlayed images as PPM files. + //! 
+ void visualize_boxes(int img_num, int class_num, std::vector& pred_boxes, std::vector& pred_probs, + std::vector& pred_cls_ids, std::vector& box_num_per_img, std::vector& ppms); +}; + +bool SampleUffFasterRcnn::build() +{ + initLibNvInferPlugins(&gLogger.getTRTLogger(), ""); + + if (mParams.loadEngine.size() > 0) + { + std::vector trtModelStream; + size_t size{0}; + std::ifstream file(mParams.loadEngine, std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + trtModelStream.resize(size); + file.read(trtModelStream.data(), size); + file.close(); + } + + IRuntime* infer = nvinfer1::createInferRuntime(gLogger); + if (mParams.dlaCore >= 0) + { + infer->setDLACore(mParams.dlaCore); + } + mEngine = std::shared_ptr( + infer->deserializeCudaEngine(trtModelStream.data(), size, nullptr), samplesCommon::InferDeleter()); + + infer->destroy(); + gLogInfo << "TRT Engine loaded from: " << mParams.loadEngine << endl; + if (!mEngine) + { + return false; + } + else + { + return true; + } + } + + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); + + if (mParams.dlaCore >= 0) + { + builder->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + builder->setDLACore(mParams.dlaCore); + builder->allowGPUFallback(true); + } + if (!builder) + { + return false; + } + + auto network = SampleUniquePtr(builder->createNetwork()); + + if (!network) + { + return false; + } + + auto parser = SampleUniquePtr(nvuffparser::createUffParser()); + + if (!parser) + { + return false; + } + + auto constructed = constructNetwork(builder, network, parser); + + if (!constructed) + { + return false; + } + + assert(network->getNbInputs() == 1); + mInputDims = network->getInput(0)->getDimensions(); + assert(mInputDims.nbDims == 3); + assert(network->getNbOutputs() == 3); + return true; +} + +bool SampleUffFasterRcnn::constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& parser) +{ + parser->registerInput(mParams.inputNodeName.c_str(), + DimsCHW(mParams.inputChannels, mParams.inputHeight, mParams.inputWidth), nvuffparser::UffInputOrder::kNCHW); + parser->registerOutput(mParams.outputRegName.c_str()); + parser->registerOutput(mParams.outputClsName.c_str()); + parser->registerOutput(mParams.outputProposalName.c_str()); + auto parsed = parser->parse(locateFile(mParams.uffFileName, mParams.dataDirs).c_str(), *network, DataType::kFLOAT); + + if (!parsed) + { + return false; + } + + builder->setMaxBatchSize(mParams.batchSize); + builder->setMaxWorkspaceSize(2_GiB); + if (mParams.fp16) + { + builder->setFp16Mode(true); + } + // Calibrator life time needs to last until after the engine is built. + std::unique_ptr calibrator; + + if (mParams.int8) + { + gLogInfo << "Using Entropy Calibrator 2" << std::endl; + const std::string listFileName = "list.txt"; + const int imageC = 3; + const int imageH = mParams.inputHeight; + const int imageW = mParams.inputWidth; + const nvinfer1::DimsNCHW imageDims{mParams.calBatchSize, imageC, imageH, imageW}; + BatchStream calibrationStream( + mParams.calBatchSize, mParams.nbCalBatches, imageDims, listFileName, mParams.dataDirs); + calibrator.reset( + new Int8EntropyCalibrator2(calibrationStream, 0, "UffFasterRcnn", mParams.inputNodeName.c_str())); + builder->setInt8Mode(true); + // Fallback to FP16 if there is no INT8 kernels. 
+ builder->setFp16Mode(true); + builder->setInt8Calibrator(calibrator.get()); + } + + mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + + if (!mEngine) + { + return false; + } + + if (mParams.saveEngine.size() > 0) + { + std::ofstream p(mParams.saveEngine, std::ios::binary); + if (!p) + { + return false; + } + nvinfer1::IHostMemory* ptr = mEngine->serialize(); + assert(ptr); + p.write(reinterpret_cast(ptr->data()), ptr->size()); + ptr->destroy(); + p.close(); + gLogInfo << "TRT Engine file saved to: " << mParams.saveEngine << endl; + } + + return true; +} + +bool SampleUffFasterRcnn::infer() +{ + // Create RAII buffer manager object + samplesCommon::BufferManager buffers(mEngine, mParams.batchSize); + auto context = SampleUniquePtr(mEngine->createExecutionContext()); + SimpleProfiler profiler("FasterRCNN performance"); + + if (mParams.profile) + { + context->setProfiler(&profiler); + } + + if (!context) + { + return false; + } + + // Read the input data into the managed buffers + if (!processInput(buffers)) + { + return false; + } + + // Memcpy from host input buffers to device input buffers + buffers.copyInputToDevice(); + bool status; + + for (int i = 0; i < mParams.repeat; ++i) + { + status = context->execute(mParams.batchSize, buffers.getDeviceBindings().data()); + } + + if (!status) + { + return false; + } + + if (mParams.profile) + { + std::cout << profiler; + } + + // Memcpy from device output buffers to host output buffers + buffers.copyOutputToHost(); + + // Post-process detections and verify results + if (!verifyOutput(buffers)) + { + return false; + } + + return true; +} + +bool SampleUffFasterRcnn::teardown() +{ + //! Clean up the libprotobuf files as the parsing is complete + //! \note It is not safe to use any other part of the protocol buffers library after + //! ShutdownProtobufLibrary() has been called. + nvuffparser::shutdownProtobufLibrary(); + return true; +} + +bool SampleUffFasterRcnn::processInput(const samplesCommon::BufferManager& buffers) +{ + const int inputC = mParams.inputChannels; + const int inputH = mParams.inputHeight; + const int inputW = mParams.inputWidth; + const int batchSize = mParams.batchSize; + std::vector imageList = mParams.inputImages; + ppms.resize(batchSize); + assert(ppms.size() <= imageList.size()); + + for (int i = 0; i < batchSize; ++i) + { + readPPMFile(imageList[i], ppms[i], mParams.dataDirs); + // resize to input dimensions. 
+ resizePPM(ppms[i], inputW, inputH); + } + + // subtract image channel mean + float* hostDataBuffer = static_cast(buffers.getHostBuffer(mParams.inputNodeName)); + float pixelMean[3]{103.939, 116.779, 123.68}; + + for (int i = 0, volImg = inputC * inputH * inputW; i < batchSize; ++i) + { + for (int c = 0; c < inputC; ++c) + { + for (unsigned j = 0, volChl = inputH * inputW; j < volChl; ++j) + { + hostDataBuffer[i * volImg + c * volChl + j] = float(ppms[i].buffer[j * inputC + 2 - c]) - pixelMean[c]; + } + } + } + + return true; +} + +bool SampleUffFasterRcnn::verifyOutput(const samplesCommon::BufferManager& buffers) +{ + const int batchSize = mParams.batchSize; + const int outputClassSize = mParams.outputClassSize; + std::vector classifierRegressorStd; + std::vector classNames; + const float* out_class = static_cast(buffers.getHostBuffer(mParams.outputClsName)); + const float* out_reg = static_cast(buffers.getHostBuffer(mParams.outputRegName)); + const float* out_proposal = static_cast(buffers.getHostBuffer(mParams.outputProposalName)); + // host memory for outputs + std::vector pred_boxes; + std::vector pred_cls_ids; + std::vector pred_probs; + std::vector box_num_per_img; + + int post_nms_top_n = mParams.postNmsTopN; + // post processing for stage 2. + batch_inverse_transform_classifier(out_proposal, post_nms_top_n, out_class, out_reg, pred_boxes, pred_cls_ids, + pred_probs, box_num_per_img, batchSize); + visualize_boxes(batchSize, outputClassSize, pred_boxes, pred_probs, pred_cls_ids, box_num_per_img, ppms); + return true; +} + +SampleUffFasterRcnnParams initializeSampleParams(const FrcnnArgs& args) +{ + SampleUffFasterRcnnParams params; + + if (args.dataDirs.empty()) + { + // Use default directories if user hasn't provided directory paths + params.dataDirs.push_back("data/faster-rcnn/"); + params.dataDirs.push_back("data/samples/faster-rcnn/"); + } + else + { + // Use the data directory provided by the user + params.dataDirs = args.dataDirs; + params.dataDirs.push_back("data/faster-rcnn/"); + params.dataDirs.push_back("data/samples/faster-rcnn/"); + } + + assert(args.batchSize == args.inputImages.size()); + params.inputImages = args.inputImages; + params.uffFileName = "faster_rcnn.uff"; + params.inputNodeName = "input_1"; + params.outputClsName = "dense_class/Softmax"; + params.outputRegName = "dense_regress/BiasAdd"; + params.outputProposalName = "proposal"; + params.batchSize = args.batchSize; + params.classNames.push_back("Automobile"); + params.classNames.push_back("Bicycle"); + params.classNames.push_back("Person"); + params.classNames.push_back("Roadsign"); + params.classNames.push_back("background"); + params.dlaCore = args.useDLACore; + params.int8 = args.runInInt8; + params.fp16 = args.runInFp16; + params.repeat = args.repeat; + params.profile = args.profile; + params.inputChannels = 3; + params.inputHeight = args.inputHeight; + params.inputWidth = args.inputWidth; + params.nmsIouThresholdClassifier = 0.3f; + params.visualizeThreshold = 0.6f; + params.classifierRegressorStd.push_back(10.0f); + params.classifierRegressorStd.push_back(10.0f); + params.classifierRegressorStd.push_back(5.0f); + params.classifierRegressorStd.push_back(5.0f); + params.outputClassSize = params.classNames.size(); + params.outputBboxSize = (params.outputClassSize - 1) * 4; + params.postNmsTopN = 300; + params.calBatchSize = 4; + params.nbCalBatches = 1; + + params.saveEngine = args.saveEngine; + params.loadEngine = args.loadEngine; + + return params; +} + +void printHelpInfo() +{ + std::cout << "Usage: 
./sample_uff_fasterRCNN [OPTIONS]" << std::endl; + std::cout << "--help[-h] Display help information" << std::endl; + std::cout << "--datadir[-d] Specify path to a data directory, overriding " + "the default. This option can be repeated to add multiple directories." + " If the option is unspecified, the default is to search" + " data/faster-rcnn/ and data/samples/faster-rcnn/." + << std::endl; + std::cout << "--useDLACore[-u] Specify a DLA engine for layers that support DLA. " + "Value can range from 0 to n-1, " + "where n is the number of DLA engines on the platform." + << std::endl; + std::cout << "--fp16[-f] Specify to run in fp16 mode." << std::endl; + std::cout << "--int8[-i] Specify to run in int8 mode." << std::endl; + std::cout << "--inputWidth[-W] Specify the input width of the model." << std::endl; + std::cout << "--inputHeight[-H] Specify the input height of the model." << std::endl; + std::cout << "--batchSize[-B] Specify the batch size for inference." << std::endl; + std::cout << "--profile[-p] Whether to do per-layer profiling." << std::endl; + std::cout << "--repeat[-r] Specify the repeat number to execute the TRT context," + " used to smooth the profiling time." + << std::endl; + std::cout << "--inputImages[-I] Specify the input images for inference." << std::endl; + std::cout << "--saveEngine[-s] Path to save engine." << std::endl; + std::cout << "--loadEngine[-l] Path to load engine." << std::endl; +} + +//! \brief Define the function to apply delta to ROIs +//! +void SampleUffFasterRcnn::batch_inverse_transform_classifier(const float* roi_after_nms, int roi_num_per_img, + const float* classifier_cls, const float* classifier_regr, std::vector& pred_boxes, + std::vector& pred_cls_ids, std::vector& pred_probs, std::vector& box_num_per_img, int N) +{ + auto max_index = [](const float* start, const float* end) -> int { + float max_val = start[0]; + int max_pos = 0; + + for (int i = 1; start + i < end; ++i) + { + if (start[i] > max_val) + { + max_val = start[i]; + max_pos = i; + } + } + + return max_pos; + }; + int box_num; + + for (int n = 0; n < N; ++n) + { + box_num = 0; + + for (int i = 0; i < roi_num_per_img; ++i) + { + auto max_idx = max_index( + classifier_cls + n * roi_num_per_img * mParams.outputClassSize + i * mParams.outputClassSize, + classifier_cls + n * roi_num_per_img * mParams.outputClassSize + i * mParams.outputClassSize + + mParams.outputClassSize); + + if (max_idx == (mParams.outputClassSize - 1) + || classifier_cls[n * roi_num_per_img * mParams.outputClassSize + max_idx + i * mParams.outputClassSize] + < mParams.visualizeThreshold) + { + continue; + } + + // inverse transform + float tx, ty, tw, th; + //(i, 20, 4) + tx = classifier_regr[n * roi_num_per_img * mParams.outputBboxSize + i * mParams.outputBboxSize + + max_idx * 4] + / mParams.classifierRegressorStd[0]; + ty = classifier_regr[n * roi_num_per_img * mParams.outputBboxSize + i * mParams.outputBboxSize + max_idx * 4 + + 1] + / mParams.classifierRegressorStd[1]; + tw = classifier_regr[n * roi_num_per_img * mParams.outputBboxSize + i * mParams.outputBboxSize + max_idx * 4 + + 2] + / mParams.classifierRegressorStd[2]; + th = classifier_regr[n * roi_num_per_img * mParams.outputBboxSize + i * mParams.outputBboxSize + max_idx * 4 + + 3] + / mParams.classifierRegressorStd[3]; + float y = roi_after_nms[n * roi_num_per_img * 4 + 4 * i] * static_cast(mParams.inputHeight); + float x = roi_after_nms[n * roi_num_per_img * 4 + 4 * i + 1] * static_cast(mParams.inputWidth); + float ymax = roi_after_nms[n * 
roi_num_per_img * 4 + 4 * i + 2] * static_cast(mParams.inputHeight); + float xmax = roi_after_nms[n * roi_num_per_img * 4 + 4 * i + 3] * static_cast(mParams.inputWidth); + float w = xmax - x; + float h = ymax - y; + float cx = x + w / 2.0f; + float cy = y + h / 2.0f; + float cx1 = tx * w + cx; + float cy1 = ty * h + cy; + float w1 = std::round(std::exp(static_cast(tw)) * w * 0.5f) * 2.0f; + float h1 = std::round(std::exp(static_cast(th)) * h * 0.5f) * 2.0f; + float x1 = std::round((cx1 - w1 / 2.0f) * 0.5f) * 2.0f; + float y1 = std::round((cy1 - h1 / 2.0f) * 0.5f) * 2.0f; + auto clip + = [](float in, float low, float high) -> float { return (in < low) ? low : (in > high ? high : in); }; + float x2 = x1 + w1; + float y2 = y1 + h1; + x1 = clip(x1, 0.0f, mParams.inputWidth - 1.0f); + y1 = clip(y1, 0.0f, mParams.inputHeight - 1.0f); + x2 = clip(x2, 0.0f, mParams.inputWidth - 1.0f); + y2 = clip(y2, 0.0f, mParams.inputHeight - 1.0f); + + if (x2 > x1 && y2 > y1) + { + pred_boxes.push_back(x1); + pred_boxes.push_back(y1); + pred_boxes.push_back(x2); + pred_boxes.push_back(y2); + pred_probs.push_back(classifier_cls[n * roi_num_per_img * mParams.outputClassSize + max_idx + + i * mParams.outputClassSize]); + pred_cls_ids.push_back(max_idx); + ++box_num; + } + } + + box_num_per_img.push_back(box_num); + } +} + +//! \brief NMS on CPU in post-processing of classifier outputs. +//! +std::vector SampleUffFasterRcnn::nms_classifier(std::vector& boxes_per_cls, + std::vector& probs_per_cls, float NMS_OVERLAP_THRESHOLD, int NMS_MAX_BOXES) +{ + int num_boxes = boxes_per_cls.size() / 4; + std::vector> score_index; + + for (int i = 0; i < num_boxes; ++i) + { + score_index.push_back(std::make_pair(probs_per_cls[i], i)); + } + + std::stable_sort(score_index.begin(), score_index.end(), + [](const std::pair& pair1, const std::pair& pair2) { + return pair1.first > pair2.first; + }); + auto overlap1D = [](float x1min, float x1max, float x2min, float x2max) -> float { + if (x1min > x2min) + { + std::swap(x1min, x2min); + std::swap(x1max, x2max); + } + + return x1max < x2min ? 0 : std::min(x1max, x2max) - x2min; + }; + auto computeIoU = [&overlap1D](float* bbox1, float* bbox2) -> float { + float overlapX = overlap1D(bbox1[0], bbox1[2], bbox2[0], bbox2[2]); + float overlapY = overlap1D(bbox1[1], bbox1[3], bbox2[1], bbox2[3]); + float area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]); + float area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]); + float overlap2D = overlapX * overlapY; + float u = area1 + area2 - overlap2D; + return u == 0 ? 0 : overlap2D / u; + }; + std::vector indices; + + for (auto i : score_index) + { + const int idx = i.second; + bool keep = true; + + for (unsigned k = 0; k < indices.size(); ++k) + { + if (keep) + { + const int kept_idx = indices[k]; + float overlap = computeIoU(&boxes_per_cls[idx * 4], &boxes_per_cls[kept_idx * 4]); + keep = overlap <= NMS_OVERLAP_THRESHOLD; + } + else + { + break; + } + } + + if (indices.size() >= static_cast(NMS_MAX_BOXES)) + { + break; + } + + if (keep) + { + indices.push_back(idx); + } + } + + return indices; +} + +//! \brief Dump the detection results(bboxes) as PPM images, overlayed on original image. +//! 
+void SampleUffFasterRcnn::visualize_boxes(int img_num, int class_num, std::vector& pred_boxes, + std::vector& pred_probs, std::vector& pred_cls_ids, std::vector& box_num_per_img, + std::vector& ppms) +{ + int box_start_idx = 0; + std::vector boxes_per_cls; + std::vector probs_per_cls; + std::vector det_per_img; + + for (int i = 0; i < img_num; ++i) + { + det_per_img.clear(); + + for (int c = 0; c < (class_num - 1); ++c) + { // skip the background + boxes_per_cls.clear(); + probs_per_cls.clear(); + + for (int k = box_start_idx; k < box_start_idx + box_num_per_img[i]; ++k) + { + if (pred_cls_ids[k] == c) + { + boxes_per_cls.push_back(pred_boxes[4 * k]); + boxes_per_cls.push_back(pred_boxes[4 * k + 1]); + boxes_per_cls.push_back(pred_boxes[4 * k + 2]); + boxes_per_cls.push_back(pred_boxes[4 * k + 3]); + probs_per_cls.push_back(pred_probs[k]); + } + } + + // apply NMS algorithm per class + auto indices_after_nms + = nms_classifier(boxes_per_cls, probs_per_cls, mParams.nmsIouThresholdClassifier, mParams.postNmsTopN); + + // Show results + for (unsigned k = 0; k < indices_after_nms.size(); ++k) + { + int idx = indices_after_nms[k]; + std::cout << "Detected " << mParams.classNames[c] << " in " << ppms[i].fileName << " with confidence " + << probs_per_cls[idx] * 100.0f << "% " << std::endl; + BBox b{boxes_per_cls[idx * 4], boxes_per_cls[idx * 4 + 1], boxes_per_cls[idx * 4 + 2], + boxes_per_cls[idx * 4 + 3]}; + det_per_img.push_back(b); + } + } + + box_start_idx += box_num_per_img[i]; + writePPMFileWithBBox(ppms[i].fileName + "_det.ppm", ppms[i], det_per_img); + } +} + +int main(int argc, char** argv) +{ + FrcnnArgs args; + bool argsOK = parseFrcnnArgs(args, argc, argv); + + if (!argsOK) + { + gLogError << "Invalid arguments" << std::endl; + printHelpInfo(); + return EXIT_FAILURE; + } + + if (args.help) + { + printHelpInfo(); + return EXIT_SUCCESS; + } + + auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast(argv)); + gLogger.reportTestStart(sampleTest); + SampleUffFasterRcnn sample(initializeSampleParams(args)); + gLogInfo << "Building and running a GPU inference engine for FasterRCNN" << std::endl; + + if (!sample.build()) + { + return gLogger.reportFail(sampleTest); + } + + if (!sample.infer()) + { + return gLogger.reportFail(sampleTest); + } + + if (!sample.teardown()) + { + return gLogger.reportFail(sampleTest); + } + + return gLogger.reportPass(sampleTest); +} diff --git a/samples/opensource/sampleUffMNIST/sampleUffMNIST.cpp b/samples/opensource/sampleUffMNIST/sampleUffMNIST.cpp index ba6fe283..44e7967c 100644 --- a/samples/opensource/sampleUffMNIST/sampleUffMNIST.cpp +++ b/samples/opensource/sampleUffMNIST/sampleUffMNIST.cpp @@ -124,6 +124,11 @@ bool SampleUffMNIST::build() { return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } auto parser = SampleUniquePtr(nvuffparser::createUffParser()); if (!parser) { @@ -131,14 +136,21 @@ bool SampleUffMNIST::build() } constructNetwork(parser, network); builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(16_MB); - builder->allowGPUFallback(true); - builder->setFp16Mode(mParams.fp16); - builder->setInt8Mode(mParams.int8); + config->setMaxWorkspaceSize(16_MiB); + config->setFlag(BuilderFlag::kGPU_FALLBACK); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } + if (mParams.int8) + { + config->setFlag(BuilderFlag::kINT8); + } - samplesCommon::enableDLA(builder.get(), mParams.dlaCore); + samplesCommon::enableDLA(builder.get(), config.get(), 
mParams.dlaCore); - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { diff --git a/samples/opensource/sampleUffMaskRCNN/CMakeLists.txt b/samples/opensource/sampleUffMaskRCNN/CMakeLists.txt new file mode 100644 index 00000000..0a17a97b --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +set(SAMPLE_SOURCES + sampleUffMaskRCNN.cpp +) + +set(SAMPLE_PARSERS "uff") +set(PLUGINS_NEEDED ON) + +include(../../CMakeSamplesTemplate.txt) diff --git a/samples/opensource/sampleUffMaskRCNN/README.md b/samples/opensource/sampleUffMaskRCNN/README.md new file mode 100644 index 00000000..4075245c --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/README.md @@ -0,0 +1,179 @@ +# Object Detection And Instance Segmentations With A TensorFlow MasK R-CNN Network + +**Table Of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) + * [TensorRT API layers and ops](#tensorrt-api-layers-and-ops) +- [Prerequisites](#prerequisites) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, sampleUffMaskRCNN, performs inference on the Mask R-CNN network in TensorRT. Mask R-CNN is based on the [Mask R-CNN](https://arxiv.org/abs/1703.06870) paper which performs the task of object detection and object mask predictions on a target image. This sample’s model is based on the Keras implementation of Mask R-CNN and its training framework can be found in the [Mask R-CNN Github repository](https://github.com/matterport/Mask_RCNN). We have verified that the pre-trained Keras model (with backbone ResNet101 + FPN and dataset coco) provided in the [v2.0](https://github.com/matterport/Mask_RCNN/releases/tag/v2.0) release can be converted to UFF and consumed by this sample. And, it is also feasible to deploy your customized Mask R-CNN model trained with specific backbone and datasets. + +**Note:** This sample is available only in GitHub and is not packaged with the product. + +## How does this sample work? + +This sample makes use of TensorRT plugins to run the Mask R-CNN model. To use these plugins, the Keras model should be converted to Tensorflow `.pb` model. Then this `.pb` model needs to be preprocessed and converted to the UFF model with the help of GraphSurgeon and the UFF utility. + +The main components of this network are the `ResizeNearest`, `ProposalLayer`, `PyramidROIAlign`, `DetectionLayer` and `SpecialSlice`. + +- `ResizeNearest` - Nearest neighbor interpolation for resizing features. This works for the FPN (Feature Pyramid Network) module. 
+ +- `ProposalLayer` - Generate the first stage's proposals based on anchors and RPN's (Region Proposal Network) outputs (scores, bbox_deltas). + +- `PyramidROIAlign` - Crop and resize the feature of ROIs (first stage's proposals) from the corresponding feature layer. + +- `DetectionLayer` - Refine the first stage's proposals to produce final detections. + +- `SpecialSlice` - A workaround plugin to slice detection output [y1, x1, y2, x2, class_id, score] to [y1, x1, y2 , x2] for data with more than one index dimensions (batch_idx, proposal_idx, detections(y1, x1, y2, x2)). + + +### TensorRT API layers and ops + +In this sample, the following layers are used. For more information about these layers, see the [TensorRT Developer Guide: Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#layers) documentation. + +[Activation layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#activation-layer) +The Activation layer implements element-wise activation functions. Specifically, this sample uses the Activation layer with the type `kRELU`. + +[Convolution layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#convolution-layer) +The Convolution layer computes a 2D (channel, height, and width) convolution, with or without bias. + +[Deconvolution layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#deconvolution-layer) +The IDeconvolutionLayer computes a 2D (channel, height, and width) deconvolution, with or without bias. + +[Padding layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#padding-layer) +The IPaddingLayer implements spatial zero-padding of tensors along the two innermost dimensions. + +[Plugin layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#plugin-layer) +Plugin layers are user-defined and provide the ability to extend the functionalities of TensorRT. See [Extending TensorRT With Custom Layers](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#extending) for more details. + +[Pooling layer](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#pooling-layer) +The Pooling layer implements pooling within a channel. Supported pooling types are `maximum`, `average` and `maximum-average blend`. + + +## Prerequisites + +1. Install the dependent Python packages. + ``` + pip install -r $TRT_SOURCE/samples/opensource/sampleUffMaskRCNN/converted/requirements.txt + ``` + +2. Install the UFF toolkit and graph surgeon; depending on your TensorRT installation method. To install the toolkit and graph surgeon, choose the method you used to install TensorRT for instructions. See [TensorRT Installation Guide: Installing TensorRT](https://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing). + +3. Modify the `conv2d_transpose` conversion function in UFF, for example `/usr/local/lib/python3.5/dist-packages/uff/converters/tensorflow/converter_functions.py` or `/usr/lib/python3.6/dist-packages/uff/converters/tensorflow/converter_functions.py`. + ``` + uff_graph.conv_transpose( + inputs[0], inputs[2], inputs[1], + strides, padding, + dilation=None, number_groups=number_groups, + left_format=lhs_fmt, right_format=rhs_fmt, + name=name, fields=fields + ) + ``` + +4. Download the Mask R-CNN repo and export to `PYTHONPATH`. + ``` + git clone https://github.com/matterport/Mask_RCNN.git + export PYTHONPATH=$PYTHONPATH:$PWD/Mask_RCNN + ``` + +5. 
Apply the patch into Mask R-CNN repo to update the model from NHWC to NCHW. + ``` + cd Mask_RCNN + git checkout 3deaec5 + git am $TRT_SOURCE/samples/opensource/sampleUffMaskRCNN/converted/0001-Update-the-Mask_RCNN-model-from-NHWC-to-NCHW.patch + ``` + +6. Download the pre-trained Keras model and place it into your `/data` folder + ``` + wget https://github.com/matterport/Mask_RCNN/releases/download/v2.0/mask_rcnn_coco.h5 + ``` + + **Note:** The md5sum of model file is e98aaff6f99e307b5e2a8a3ff741a518. + +7. Convert the h5 model to the UFF model and place it into your `/data` folder + ``` + cd $TRT_SOURCE/samples/opensource/sampleUffMaskRCNN/converted/ + python mrcnn_to_trt_single.py -w /path/to/data/mask_rcnn_coco.h5 -o /path/to/data/mrcnn_nchw.uff -p ./config.py + ``` + +8. Populate your `/data` folder with the following test images. + ``` + /usr/src/tensorrt/data/faster-rcnn/001763.ppm + /usr/src/tensorrt/data/faster-rcnn/004545.ppm + ``` + +## Running the sample + +1. Compile this sample by running `make` in the `/samples/sample_uff_maskRCNN` directory. The binary named `sample_uff_maskRCNN` will be created in the `/bin` directory. + ``` + cd /samples/sample_uff_maskRCNN + make + ``` + + Where `` is where you installed TensorRT. + +2. Run the sample to perform object detection and object mask prediction. + + To run the sample in FP32 mode: + ``` + ./sample_uff_maskRCNN -d path/to/data + ``` + + To run the sample in FP16 mode: + ``` + ./sample_uff_maskRCNN -d path/to/data --fp16 + ``` + +3. Verify that the sample ran successfully. If the sample runs successfully you should see output similar to the following: + ``` + [I] Detected dog in../../data/001763.ppm with confidence 99.9064 and coordinates (257.351, 14.2238, 489.272, 364.817) + [I] Detected dog in../../data/001763.ppm with confidence 99.8484 and coordinates (14.3269, 52.0974, 320.913, 363.364) + [I] The results are stored in current directory: 0.ppm + [I] Detected horse in../../data/004545.ppm with confidence 99.9796 and coordinates (164.81, 22.6816, 386.512, 308.955) + [I] Detected bottle in../../data/004545.ppm with confidence 98.5529 and coordinates (218.719, 237.04, 229.382, 261.205) + [I] The results are stored in current directory: 1.ppm + &&&& PASSED TensorRT.sample_maskrcnn # ../build/cmake/out/sample_uff_maskRCNN -d ../../data/ + ``` + This output shows that the sample ran successfully; `PASSED`. + +### Sample `--help` options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. +``` +Usage: ./sample_maskRCNN [-h or --help] [-d or --datadir=] +--help Display help information +--datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use data/samples/maskrcnn/ and data/maskrcnn/ +--fp16 Specify to run in fp16 mode. +``` + +## Additional resources + +The following resources provide a deeper understanding about sampleUffMaskRCNN. 
+ +**Documentation** +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +## License + +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + + +## Changelog + +July 2019 +This is the first release of the `README.md` file and sample. + + +## Known issues + +1. Tensorflow installed from PyPI (`pip install tensorflow-gpu`) requires CUDA 10.0 and is incompatible with CUDA 10.1. To generate the UFF model required for this sample, use a container built with `CUDA_VERSION=10.0`. diff --git a/samples/opensource/sampleUffMaskRCNN/converted/0001-Update-the-Mask_RCNN-model-from-NHWC-to-NCHW.patch b/samples/opensource/sampleUffMaskRCNN/converted/0001-Update-the-Mask_RCNN-model-from-NHWC-to-NCHW.patch new file mode 100644 index 00000000..e4f0a56c --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/converted/0001-Update-the-Mask_RCNN-model-from-NHWC-to-NCHW.patch @@ -0,0 +1,241 @@ +From 00cfd7d7ce323df1f71048d959681349ca4fed78 Mon Sep 17 00:00:00 2001 +From: Nine Feng +Date: Wed, 24 Jul 2019 09:22:44 +0800 +Subject: [PATCH] Update the Mask_RCNN model from NHWC to NCHW + +1. Change all BN layers from NHWC to NCHW +2. Modify class PyramidROIAlign to be compatible with NCHW format +3. Change the input format in function `build_rpn_model` +4. Permute the feature in function `rpn_graph` and change 'lambda' to 'Reshape' +5. Change squeeze axis in function `fpn_classifier_graph` +6. Change the input format in function `build` of class `MaskRCNN` +7. (Optional) Change the input blob for prediction in function `detect` of class `MaskRCNN` +8. 
Change urllib.request.urlopen to urllib.urlopen to be compatible with python 2.x +--- + mrcnn/model.py | 63 +++++++++++++++++++++++++++++++++++----------------------- + mrcnn/utils.py | 4 ++-- + 2 files changed, 40 insertions(+), 27 deletions(-) + +diff --git a/mrcnn/model.py b/mrcnn/model.py +index 62cb2b0..1508f2c 100644 +--- a/mrcnn/model.py ++++ b/mrcnn/model.py +@@ -110,17 +110,17 @@ def identity_block(input_tensor, kernel_size, filters, stage, block, + + x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', + use_bias=use_bias)(input_tensor) +- x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn) ++ x = BatchNorm(name=bn_name_base + '2a', axis=1)(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', + name=conv_name_base + '2b', use_bias=use_bias)(x) +- x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn) ++ x = BatchNorm(name=bn_name_base + '2b', axis=1)(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', + use_bias=use_bias)(x) +- x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn) ++ x = BatchNorm(name=bn_name_base + '2c', axis=1)(x, training=train_bn) + + x = KL.Add()([x, input_tensor]) + x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x) +@@ -147,21 +147,21 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, + + x = KL.Conv2D(nb_filter1, (1, 1), strides=strides, + name=conv_name_base + '2a', use_bias=use_bias)(input_tensor) +- x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn) ++ x = BatchNorm(name=bn_name_base + '2a', axis=1)(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', + name=conv_name_base + '2b', use_bias=use_bias)(x) +- x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn) ++ x = BatchNorm(name=bn_name_base + '2b', axis=1)(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + + '2c', use_bias=use_bias)(x) +- x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn) ++ x = BatchNorm(name=bn_name_base + '2c', axis=1)(x, training=train_bn) + + shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides, + name=conv_name_base + '1', use_bias=use_bias)(input_tensor) +- shortcut = BatchNorm(name=bn_name_base + '1')(shortcut, training=train_bn) ++ shortcut = BatchNorm(name=bn_name_base + '1', axis=1)(shortcut, training=train_bn) + + x = KL.Add()([x, shortcut]) + x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x) +@@ -178,7 +178,7 @@ def resnet_graph(input_image, architecture, stage5=False, train_bn=True): + # Stage 1 + x = KL.ZeroPadding2D((3, 3))(input_image) + x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x) +- x = BatchNorm(name='bn_conv1')(x, training=train_bn) ++ x = BatchNorm(name='bn_conv1', axis=1)(x, training=train_bn) + x = KL.Activation('relu')(x) + C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x) + # Stage 2 +@@ -365,6 +365,19 @@ class PyramidROIAlign(KE.Layer): + super(PyramidROIAlign, self).__init__(**kwargs) + self.pool_shape = tuple(pool_shape) + ++ def NCHW_crop_and_resize(self, feature_map, level_boxes, box_indices, crop_size, method="bilinear"): ++ # NCHW(0,1,2,3) -> NHWC(0,2,3,1) ++ feature_map = tf.transpose(feature_map, [0, 2, 3, 1]) ++ ++ # crop and resize ++ box_feature = tf.image.crop_and_resize(feature_map, level_boxes, ++ 
box_indices, crop_size, method=method) ++ ++ # NHWC(0,1,2,3) -> NCHW(0,3,1,2) ++ box_feature = tf.transpose(box_feature, [0, 3, 1, 2]) ++ ++ return box_feature ++ + def call(self, inputs): + # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords + boxes = inputs[0] +@@ -418,9 +431,7 @@ class PyramidROIAlign(KE.Layer): + # Here we use the simplified approach of a single value per bin, + # which is how it's done in tf.crop_and_resize() + # Result: [batch * num_boxes, pool_height, pool_width, channels] +- pooled.append(tf.image.crop_and_resize( +- feature_maps[i], level_boxes, box_indices, self.pool_shape, +- method="bilinear")) ++ pooled.append(self.NCHW_crop_and_resize(feature_maps[i], level_boxes, box_indices, self.pool_shape, method="bilinear")) + + # Pack pooled features into one tensor + pooled = tf.concat(pooled, axis=0) +@@ -447,7 +458,7 @@ class PyramidROIAlign(KE.Layer): + return pooled + + def compute_output_shape(self, input_shape): +- return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], ) ++ return input_shape[0][:2] + (input_shape[2][1], ) + self.pool_shape + + + ############################################################ +@@ -853,8 +864,9 @@ def rpn_graph(feature_map, anchors_per_location, anchor_stride): + activation='linear', name='rpn_class_raw')(shared) + + # Reshape to [batch, anchors, 2] +- rpn_class_logits = KL.Lambda( +- lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x) ++ x = KL.Permute((2,3,1))(x) ++ rpn_class_logits = KL.Reshape((-1, 2))(x) ++ x = KL.Permute((2,3,1))(x) + + # Softmax on last dimension of BG/FG. + rpn_probs = KL.Activation( +@@ -866,7 +878,7 @@ def rpn_graph(feature_map, anchors_per_location, anchor_stride): + activation='linear', name='rpn_bbox_pred')(shared) + + # Reshape to [batch, anchors, 4] +- rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x) ++ rpn_bbox = KL.Reshape((-1, 4))(x) + + return [rpn_class_logits, rpn_probs, rpn_bbox] + +@@ -887,7 +899,7 @@ def build_rpn_model(anchor_stride, anchors_per_location, depth): + rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be + applied to anchors. 
+ """ +- input_feature_map = KL.Input(shape=[None, None, depth], ++ input_feature_map = KL.Input(shape=[depth, None, None], + name="input_rpn_feature_map") + outputs = rpn_graph(input_feature_map, anchors_per_location, anchor_stride) + return KM.Model([input_feature_map], outputs, name="rpn_model") +@@ -926,14 +938,14 @@ def fpn_classifier_graph(rois, feature_maps, image_meta, + # Two 1024 FC layers (implemented with Conv2D for consistency) + x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"), + name="mrcnn_class_conv1")(x) +- x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn1')(x, training=train_bn) ++ x = KL.TimeDistributed(BatchNorm(axis=1), name='mrcnn_class_bn1')(x, training=train_bn) + x = KL.Activation('relu')(x) + x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (1, 1)), + name="mrcnn_class_conv2")(x) +- x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn2')(x, training=train_bn) ++ x = KL.TimeDistributed(BatchNorm(axis=1), name='mrcnn_class_bn2')(x, training=train_bn) + x = KL.Activation('relu')(x) + +- shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2), ++ shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 4), 3), + name="pool_squeeze")(x) + + # Classifier head +@@ -976,25 +988,25 @@ def build_fpn_mask_graph(rois, feature_maps, image_meta, + # Conv layers + x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), + name="mrcnn_mask_conv1")(x) +- x = KL.TimeDistributed(BatchNorm(), ++ x = KL.TimeDistributed(BatchNorm(axis=1), + name='mrcnn_mask_bn1')(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), + name="mrcnn_mask_conv2")(x) +- x = KL.TimeDistributed(BatchNorm(), ++ x = KL.TimeDistributed(BatchNorm(axis=1), + name='mrcnn_mask_bn2')(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), + name="mrcnn_mask_conv3")(x) +- x = KL.TimeDistributed(BatchNorm(), ++ x = KL.TimeDistributed(BatchNorm(axis=1), + name='mrcnn_mask_bn3')(x, training=train_bn) + x = KL.Activation('relu')(x) + + x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), + name="mrcnn_mask_conv4")(x) +- x = KL.TimeDistributed(BatchNorm(), ++ x = KL.TimeDistributed(BatchNorm(axis=1), + name='mrcnn_mask_bn4')(x, training=train_bn) + x = KL.Activation('relu')(x) + +@@ -1853,7 +1865,7 @@ class MaskRCNN(): + + # Inputs + input_image = KL.Input( +- shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image") ++ shape=[config.IMAGE_SHAPE[2], 1024, 1024 ], name="input_image") + input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE], + name="input_image_meta") + if mode == "training": +@@ -2520,8 +2532,9 @@ class MaskRCNN(): + log("image_metas", image_metas) + log("anchors", anchors) + # Run object detection ++ molded_input_images = np.transpose(molded_images, (0, 3, 1, 2)) + detections, _, _, mrcnn_mask, _, _, _ =\ +- self.keras_model.predict([molded_images, image_metas, anchors], verbose=0) ++ self.keras_model.predict([molded_input_images, image_metas, anchors], verbose=0) + # Process detections + results = [] + for i, image in enumerate(images): +diff --git a/mrcnn/utils.py b/mrcnn/utils.py +index ff93e10..baceb4c 100644 +--- a/mrcnn/utils.py ++++ b/mrcnn/utils.py +@@ -18,7 +18,7 @@ import scipy + import skimage.color + import skimage.io + import skimage.transform +-import urllib.request ++import urllib + import shutil + import warnings + from distutils.version import LooseVersion +@@ -844,7 +844,7 @@ def 
download_trained_weights(coco_model_path, verbose=1): + """ + if verbose > 0: + print("Downloading pretrained model to " + coco_model_path + " ...") +- with urllib.request.urlopen(COCO_MODEL_URL) as resp, open(coco_model_path, 'wb') as out: ++ with urllib.urlopen(COCO_MODEL_URL) as resp, open(coco_model_path, 'wb') as out: + shutil.copyfileobj(resp, out) + if verbose > 0: + print("... done downloading pretrained model!") +-- +2.7.4 + diff --git a/samples/opensource/sampleUffMaskRCNN/converted/config.py b/samples/opensource/sampleUffMaskRCNN/converted/config.py new file mode 100644 index 00000000..d3c4d2fa --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/converted/config.py @@ -0,0 +1,125 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import graphsurgeon as gs +import tensorflow as tf + +fpn_p5upsampled = gs.create_plugin_node("fpn_p5upsampled", op="ResizeNearest_TRT", dtype=tf.float32, scale=2.0) +fpn_p4upsampled = gs.create_plugin_node("fpn_p4upsampled", op="ResizeNearest_TRT", dtype=tf.float32, scale=2.0) +fpn_p3upsampled = gs.create_plugin_node("fpn_p3upsampled", op="ResizeNearest_TRT", dtype=tf.float32, scale=2.0) + +roi = gs.create_plugin_node("ROI", op="ProposalLayer_TRT", prenms_topk=1024, keep_topk=1000, iou_threshold=0.7) +roi_align_classifier = gs.create_plugin_node("roi_align_classifier", op="PyramidROIAlign_TRT", pooled_size=7) +mrcnn_detection = gs.create_plugin_node("mrcnn_detection", op="DetectionLayer_TRT", num_classes=81, keep_topk=100, score_threshold=0.7, iou_threshold=0.3) +roi_align_mask = gs.create_plugin_node("roi_align_mask_trt", op="PyramidROIAlign_TRT", pooled_size=14) +mrcnn_detection_bboxes = gs.create_plugin_node("mrcnn_detection_bboxes", op="SpecialSlice_TRT") + +namespace_plugin_map = { +"fpn_p5upsampled":fpn_p5upsampled, + +"fpn_p4upsampled":fpn_p4upsampled, + +"fpn_p3upsampled":fpn_p3upsampled, + +"roi_align_classifier":roi_align_classifier, + +"mrcnn_detection":mrcnn_detection, + +"ROI":roi, + +"roi_align_mask":roi_align_mask, + +"lambda_1": mrcnn_detection_bboxes, + +} + +timedistributed_remove_list = [ + "mrcnn_class_conv1/Reshape/shape", "mrcnn_class_conv1/Reshape", "mrcnn_class_conv1/Reshape_1/shape", "mrcnn_class_conv1/Reshape_1", + "mrcnn_class_bn1/Reshape/shape", "mrcnn_class_bn1/Reshape", "mrcnn_class_bn1/Reshape_5/shape", "mrcnn_class_bn1/Reshape_5", + "mrcnn_class_conv2/Reshape/shape", "mrcnn_class_conv2/Reshape", "mrcnn_class_conv2/Reshape_1/shape", "mrcnn_class_conv2/Reshape_1", + "mrcnn_class_bn2/Reshape/shape", "mrcnn_class_bn2/Reshape", "mrcnn_class_bn2/Reshape_5/shape", "mrcnn_class_bn2/Reshape_5", + "mrcnn_class_logits/Reshape/shape", "mrcnn_class_logits/Reshape","mrcnn_class_logits/Reshape_1/shape", "mrcnn_class_logits/Reshape_1", + "mrcnn_class/Reshape/shape", "mrcnn_class/Reshape","mrcnn_class/Reshape_1/shape", "mrcnn_class/Reshape_1", + "mrcnn_bbox_fc/Reshape/shape", "mrcnn_bbox_fc/Reshape","mrcnn_bbox_fc/Reshape_1/shape", "mrcnn_bbox_fc/Reshape_1", + + 
"mrcnn_mask_conv1/Reshape/shape", "mrcnn_mask_conv1/Reshape", "mrcnn_mask_conv1/Reshape_1/shape", "mrcnn_mask_conv1/Reshape_1", + "mrcnn_mask_bn1/Reshape/shape", "mrcnn_mask_bn1/Reshape", "mrcnn_mask_bn1/Reshape_5/shape", "mrcnn_mask_bn1/Reshape_5", + "mrcnn_mask_conv2/Reshape/shape", "mrcnn_mask_conv2/Reshape", "mrcnn_mask_conv2/Reshape_1/shape", "mrcnn_mask_conv2/Reshape_1", + "mrcnn_mask_bn2/Reshape/shape", "mrcnn_mask_bn2/Reshape", "mrcnn_mask_bn2/Reshape_5/shape", "mrcnn_mask_bn2/Reshape_5", + "mrcnn_mask_conv3/Reshape/shape", "mrcnn_mask_conv3/Reshape", "mrcnn_mask_conv3/Reshape_1/shape", "mrcnn_mask_conv3/Reshape_1", + "mrcnn_mask_bn3/Reshape/shape", "mrcnn_mask_bn3/Reshape", "mrcnn_mask_bn3/Reshape_5/shape", "mrcnn_mask_bn3/Reshape_5", + "mrcnn_mask_conv4/Reshape/shape", "mrcnn_mask_conv4/Reshape", "mrcnn_mask_conv4/Reshape_1/shape", "mrcnn_mask_conv4/Reshape_1", + "mrcnn_mask_bn4/Reshape/shape", "mrcnn_mask_bn4/Reshape", "mrcnn_mask_bn4/Reshape_5/shape", "mrcnn_mask_bn4/Reshape_5", + "mrcnn_mask_deconv/Reshape/shape", "mrcnn_mask_deconv/Reshape", "mrcnn_mask_deconv/Reshape_1/shape", "mrcnn_mask_deconv/Reshape_1", + "mrcnn_mask/Reshape/shape", "mrcnn_mask/Reshape", "mrcnn_mask/Reshape_1/shape", "mrcnn_mask/Reshape_1", + ] + +timedistributed_connect_pairs = [ + ("mrcnn_mask_deconv/Relu", "mrcnn_mask/convolution"), # mrcnn_mask_deconv -> mrcnn_mask + ("activation_74/Relu", "mrcnn_mask_deconv/conv2d_transpose"), #active74 -> mrcnn_mask_deconv + ("mrcnn_mask_bn4/batchnorm/add_1","activation_74/Relu"), # mrcnn_mask_bn4 -> active74 + ("mrcnn_mask_conv4/BiasAdd", "mrcnn_mask_bn4/batchnorm/mul_1"), #mrcnn_mask_conv4 -> mrcnn_mask_bn4 + ("activation_73/Relu", "mrcnn_mask_conv4/convolution"), #active73 -> mrcnn_mask_conv4 + ("mrcnn_mask_bn3/batchnorm/add_1","activation_73/Relu"), #mrcnn_mask_bn3 -> active73 + ("mrcnn_mask_conv3/BiasAdd", "mrcnn_mask_bn3/batchnorm/mul_1"), #mrcnn_mask_conv3 -> mrcnn_mask_bn3 + ("activation_72/Relu", "mrcnn_mask_conv3/convolution"), #active72 -> mrcnn_mask_conv3 + ("mrcnn_mask_bn2/batchnorm/add_1","activation_72/Relu"), #mrcnn_mask_bn2 -> active72 + ("mrcnn_mask_conv2/BiasAdd", "mrcnn_mask_bn2/batchnorm/mul_1"), #mrcnn_mask_conv2 -> mrcnn_mask_bn2 + ("activation_71/Relu", "mrcnn_mask_conv2/convolution"), #active71 -> mrcnn_mask_conv2 + ("mrcnn_mask_bn1/batchnorm/add_1","activation_71/Relu"), #mrcnn_mask_bn1 -> active71 + ("mrcnn_mask_conv1/BiasAdd", "mrcnn_mask_bn1/batchnorm/mul_1"), #mrcnn_mask_conv1 -> mrcnn_mask_bn1 + ("roi_align_mask_trt", "mrcnn_mask_conv1/convolution"), #roi_align_mask -> mrcnn_mask_conv1 + + + ("mrcnn_class_bn2/batchnorm/add_1","activation_69/Relu"), # mrcnn_class_bn2 -> active 69 + ("mrcnn_class_conv2/BiasAdd", "mrcnn_class_bn2/batchnorm/mul_1"), # mrcnn_class_conv2 -> mrcnn_class_bn2 + ("activation_68/Relu", "mrcnn_class_conv2/convolution"), # active 68 -> mrcnn_class_conv2 + ("mrcnn_class_bn1/batchnorm/add_1","activation_68/Relu"), # mrcnn_class_bn1 -> active 68 + ("mrcnn_class_conv1/BiasAdd", "mrcnn_class_bn1/batchnorm/mul_1"), # mrcnn_class_conv1 -> mrcnn_class_bn1 + ("roi_align_classifier", "mrcnn_class_conv1/convolution"), # roi_align_classifier -> mrcnn_class_conv1 + ] + +dense_compatible_patch =["pool_squeeze/Squeeze", "pool_squeeze/Squeeze_1", #No need to squeeze the dimensions for TRT Dense Layer + "mrcnn_bbox/Shape", "mrcnn_bbox/strided_slice/stack", # mrcnn_bbox(Reshape): No need to reshape, cause we can process it as 1-D array in detectionlayer's kernel + "mrcnn_bbox/strided_slice/stack_1", 
"mrcnn_bbox/strided_slice/stack_2", + "mrcnn_bbox/strided_slice", "mrcnn_bbox/Reshape/shape/1", + "mrcnn_bbox/Reshape/shape/2", "mrcnn_bbox/Reshape/shape/3", + "mrcnn_bbox/Reshape/shape", "mrcnn_bbox/Reshape"] + +dense_compatible_connect_pairs = [ + ("activation_69/Relu","mrcnn_bbox_fc/MatMul"), #activation_69 -> mrcnn_bbox_fc + ("activation_69/Relu", "mrcnn_class_logits/MatMul"), #activation_69 -> mrcnn_class_logits + ("mrcnn_class_logits/BiasAdd", "mrcnn_class/Softmax"), #mrcnn_class_logits -> mrcnn_class + ("mrcnn_class/Softmax", "mrcnn_detection"), #mrcnn_class -> mrcnn_detection + ("mrcnn_bbox_fc/BiasAdd", "mrcnn_detection"), #mrcnn_bbox_fc -> mrcnn_detection + ] + +def connect(dynamic_graph, connections_list): + + for node_a_name, node_b_name in connections_list: + if node_a_name not in dynamic_graph.node_map[node_b_name].input: + dynamic_graph.node_map[node_b_name].input.insert(0, node_a_name) + +def preprocess(dynamic_graph): + # Now create a new graph by collapsing namespaces + dynamic_graph.collapse_namespaces(namespace_plugin_map, unique_inputs=True) + dynamic_graph.remove(timedistributed_remove_list) + dynamic_graph.remove(dense_compatible_patch) + dynamic_graph.remove(['input_anchors', 'input_image_meta']) + + connect(dynamic_graph, timedistributed_connect_pairs) + connect(dynamic_graph, dense_compatible_connect_pairs) + diff --git a/samples/opensource/sampleUffMaskRCNN/converted/mrcnn_to_trt_single.py b/samples/opensource/sampleUffMaskRCNN/converted/mrcnn_to_trt_single.py new file mode 100644 index 00000000..71450a19 --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/converted/mrcnn_to_trt_single.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from keras.models import model_from_json, Model +from keras import backend as K +from keras.layers import Input, Lambda +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import graph_io +from mrcnn.model import * +import mrcnn.model as modellib +from mrcnn.config import Config +import sys +import os +ROOT_DIR = os.path.abspath("./") +LOG_DIR = os.path.join(ROOT_DIR, "logs") +import argparse +import os +import uff + + +def parse_command_line_arguments(args=None): + parser = argparse.ArgumentParser(prog='keras_to_trt', description='Convert trained keras .hdf5 model to trt .uff') + + parser.add_argument( + '-w', + '--weights', + type=str, + default=None, + required=True, + help="The checkpoint weights file of keras model." + ) + + parser.add_argument( + '-o', + '--output_file', + type=str, + default=None, + required=True, + help="The path to output .uff file." 
+ ) + + parser.add_argument( + '-l', + '--list-nodes', + action='store_true', + help="show list of nodes contained in converted pb" + ) + + parser.add_argument( + '-p', + '--preprocessor', + type=str, + default=False, + help="The preprocess function for converting tf node to trt plugin" + ) + + return parser.parse_args(args) + + +class CocoConfig(Config): + """Configuration for training on MS COCO. + Derives from the base Config class and overrides values specific + to the COCO dataset. + """ + # Give the configuration a recognizable name + NAME = "coco" + + # We use a GPU with 12GB memory, which can fit two images. + # Adjust down if you use a smaller GPU. + IMAGES_PER_GPU = 2 + + # Uncomment to train on 8 GPUs (default is 1) + # GPU_COUNT = 8 + + # Number of classes (including background) + NUM_CLASSES = 1 + 80 # COCO has 80 classes + +class InferenceConfig(CocoConfig): + # Set batch size to 1 since we'll be running inference on + # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU + GPU_COUNT = 1 + IMAGES_PER_GPU = 1 + +def main(args=None): + + K.set_image_data_format('channels_first') + K.set_learning_phase(0) + + args = parse_command_line_arguments(args) + + model_weights_path = args.weights + output_file_path = args.output_file + list_nodes = args.list_nodes + + config = InferenceConfig() + config.display() + + model = modellib.MaskRCNN(mode="inference", model_dir=LOG_DIR, config=config).keras_model + + model.load_weights(model_weights_path, by_name=True) + + + model_A = Model(inputs=model.input, outputs=model.get_layer('mrcnn_mask').output) + model_A.summary() + + output_nodes = ['mrcnn_detection', "mrcnn_mask/Sigmoid"] + convert_model(model_A, output_file_path, output_nodes, preprocessor=args.preprocessor, + text=True, list_nodes=list_nodes) + + +def convert_model(inference_model, output_path, output_nodes=[], preprocessor=None, text=False, + list_nodes=False): + # convert the keras model to pb + orig_output_node_names = [node.op.name for node in inference_model.outputs] + print("The output names of tensorflow graph nodes: {}".format(str(orig_output_node_names))) + + sess = K.get_session() + + constant_graph = graph_util.convert_variables_to_constants( + sess, + sess.graph.as_graph_def(), + orig_output_node_names) + + temp_pb_path = "../temp.pb" + graph_io.write_graph(constant_graph, os.path.dirname(temp_pb_path), os.path.basename(temp_pb_path), + as_text=False) + + predefined_output_nodes = output_nodes + if predefined_output_nodes != []: + trt_output_nodes = predefined_output_nodes + else: + trt_output_nodes = orig_output_node_names + + # convert .pb to .uff + uff.from_tensorflow_frozen_model( + temp_pb_path, + output_nodes=trt_output_nodes, + preprocessor=preprocessor, + text=text, + list_nodes=list_nodes, + output_filename=output_path, + debug_mode = False + ) + + os.remove(temp_pb_path) + + +if __name__ == "__main__": + main() diff --git a/samples/opensource/sampleUffMaskRCNN/converted/requirements.txt b/samples/opensource/sampleUffMaskRCNN/converted/requirements.txt new file mode 100644 index 00000000..35569752 --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/converted/requirements.txt @@ -0,0 +1,3 @@ +keras == 2.1.3 +tensorflow-gpu >= 1.9.0 +scikit-image diff --git a/samples/opensource/sampleUffMaskRCNN/mrcnn_config.h b/samples/opensource/sampleUffMaskRCNN/mrcnn_config.h new file mode 100644 index 00000000..6fa3b056 --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/mrcnn_config.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MASKRCNN_CONFIG_HEADER +#define MASKRCNN_CONFIG_HEADER +#include "NvInfer.h" +#include +#include +using namespace nvinfer1; + +namespace MaskRCNNConfig +{ +static const nvinfer1::Dims3 IMAGE_SHAPE{3, 1024, 1024}; + +// Pooled ROIs +static const int POOL_SIZE = 7; +static const int MASK_POOL_SIZE = 14; + +// Threshold to determine the mask area out of final convolution output +static const float MASK_THRESHOLD = 0.5; + +// Bounding box refinement standard deviation for RPN and final detections. +static const float RPN_BBOX_STD_DEV[] = {0.1, 0.1, 0.2, 0.2}; +static const float BBOX_STD_DEV[] = {0.1, 0.1, 0.2, 0.2}; + +// Max number of final detections +static const int DETECTION_MAX_INSTANCES = 100; + +// Minimum probability value to accept a detected instance +// ROIs below this threshold are skipped +static const float DETECTION_MIN_CONFIDENCE = 0.7; + +// Non-maximum suppression threshold for detection +static const float DETECTION_NMS_THRESHOLD = 0.3; + +// The strides of each layer of the FPN Pyramid. These values +// are based on a Resnet101 backbone. +static const std::vector BACKBONE_STRIDES = {4, 8, 16, 32, 64}; + +// Size of the fully-connected layers in the classification graph +static const int FPN_CLASSIF_FC_LAYERS_SIZE = 1024; + +// Size of the top-down layers used to build the feature pyramid +static const int TOP_DOWN_PYRAMID_SIZE = 256; + +// Number of classification classes (including background) +static const int NUM_CLASSES = 1 + 80; // COCO has 80 classes + +// Length of square anchor side in pixels +static const std::vector RPN_ANCHOR_SCALES = {32, 64, 128, 256, 512}; + +// Ratios of anchors at each cell (width/height) +// A value of 1 represents a square anchor, and 0.5 is a wide anchor +static const float RPN_ANCHOR_RATIOS[] = {0.5, 1, 2}; + +// Anchor stride +// If 1 then anchors are created for each cell in the backbone feature map. +// If 2, then anchors are created for every other cell, and so on. +static const int RPN_ANCHOR_STRIDE = 1; + +// Although Python impementation uses 6000, +// TRT fails if this number larger than MAX_TOPK_K defined in engine/checkMacros.h +static const int MAX_PRE_NMS_RESULTS = 1024; // 3840; + +// Non-max suppression threshold to filter RPN proposals. +// You can increase this during training to generate more propsals. 
+static const float RPN_NMS_THRESHOLD = 0.7; + +// ROIs kept after non-maximum suppression (training and inference) +static const int POST_NMS_ROIS_INFERENCE = 1000; + +// COCO Class names +static const std::vector CLASS_NAMES = { + "BG", + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +}; + +static const std::string MODEL_NAME = "mrcnn_nchw.uff"; +static const std::string MODEL_INPUT = "input_image"; +static const Dims3 MODEL_INPUT_SHAPE = IMAGE_SHAPE; +static const std::vector MODEL_OUTPUTS = {"mrcnn_detection", "mrcnn_mask/Sigmoid"}; +static const Dims2 MODEL_DETECTION_SHAPE{DETECTION_MAX_INSTANCES, 6}; +static const Dims4 MODEL_MASK_SHAPE{DETECTION_MAX_INSTANCES, NUM_CLASSES, 28, 28}; +} // namespace MaskRCNNConfig +#endif diff --git a/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp b/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp new file mode 100644 index 00000000..97a1fabc --- /dev/null +++ b/samples/opensource/sampleUffMaskRCNN/sampleUffMaskRCNN.cpp @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvUffParser.h" + +#include "argsParser.h" +#include "buffers.h" +#include "common.h" +#include "logger.h" + +// max +#include + +// data type +#include + +// MaskRCNN Parameter +#include "mrcnn_config.h" + +const std::string gSampleName = "TensorRT.sample_maskrcnn"; + +namespace MaskRCNNUtils +{ +struct RawDetection +{ + float y1, x1, y2, x2, class_id, score; +}; + +struct Mask +{ + float raw[MaskRCNNConfig::MASK_POOL_SIZE * 2 * MaskRCNNConfig::MASK_POOL_SIZE * 2]; +}; + +struct BBoxInfo +{ + samplesCommon::BBox box; + int label = -1; + float prob = 0.0f; + + Mask* mask = nullptr; +}; + +template +struct PPM +{ + std::string magic, fileName; + int h, w, max; + std::vector buffer; +}; + +void readPPMFile(const std::string& filename, PPM& ppm) +{ + ppm.fileName = filename; + std::ifstream infile(filename, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open. "); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + + ppm.buffer.resize(ppm.w * ppm.h * 3, 0); + + infile.read(reinterpret_cast(ppm.buffer.data()), ppm.w * ppm.h * 3); +} + +void writePPMFile(const std::string& filename, PPM& ppm) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + + outfile.write(reinterpret_cast(ppm.buffer.data()), ppm.w * ppm.h * 3); +} + +template +void resizePPM(const PPM& src, PPM& dst, int target_height, int target_width, int channel) +{ + auto clip = [](float in, float low, float high) -> float { return (in < low) ? low : (in > high ? 
high : in); }; + int original_height = src.h; + int original_width = src.w; + assert(dst.h == target_height); + assert(dst.w == target_width); + float ratio_h = static_cast(original_height - 1.0f) / static_cast(target_height - 1.0f); + float ratio_w = static_cast(original_width - 1.0f) / static_cast(target_width - 1.0f); + + int dst_idx = 0; + for (int y = 0; y < target_height; ++y) + { + for (int x = 0; x < target_width; ++x) + { + float x0 = static_cast(x) * ratio_w; + float y0 = static_cast(y) * ratio_h; + int left = static_cast(clip(std::floor(x0), 0.0f, static_cast(original_width - 1.0f))); + int top = static_cast(clip(std::floor(y0), 0.0f, static_cast(original_height - 1.0f))); + int right = static_cast(clip(std::ceil(x0), 0.0f, static_cast(original_width - 1.0f))); + int bottom = static_cast(clip(std::ceil(y0), 0.0f, static_cast(original_height - 1.0f))); + + for (int c = 0; c < channel; ++c) + { + // H, W, C ordering + T left_top_val = src.buffer[top * (original_width * channel) + left * (channel) + c]; + T right_top_val = src.buffer[top * (original_width * channel) + right * (channel) + c]; + T left_bottom_val = src.buffer[bottom * (original_width * channel) + left * (channel) + c]; + T right_bottom_val = src.buffer[bottom * (original_width * channel) + right * (channel) + c]; + float top_lerp = left_top_val + (right_top_val - left_top_val) * (x0 - left); + float bottom_lerp = left_bottom_val + (right_bottom_val - left_bottom_val) * (x0 - left); + float lerp = clip(std::round(top_lerp + (bottom_lerp - top_lerp) * (y0 - top)), 0.0f, 255.0f); + dst.buffer[dst_idx] = (static_cast(lerp)); + dst_idx++; + } + } + } +} + +void padPPM(const PPM& src, PPM& dst, int top, int bottom, int left, int right) +{ + assert(dst.h == (src.h + top + bottom)); + assert(dst.w == (src.w + left + right)); + + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < src.w; x++) + { + for (int c = 0; c < 3; c++) + { + dst.buffer[(top + y) * dst.w * 3 + (left + x) * 3 + c] = src.buffer[y * src.w * 3 + x * 3 + c]; + } + } + } +} + +void preprocessPPM(PPM& src, PPM& dst, int target_h, int target_w) +{ + assert(target_h == target_w); + int input_dim = target_h; + // padding the input img to model's input_size: + const int image_dim = std::max(src.h, src.w); + int resize_h = src.h * input_dim / image_dim; + int resize_w = src.w * input_dim / image_dim; + assert(resize_h == input_dim || resize_w == input_dim); + + int y_offset = (input_dim - resize_h) / 2; + int x_offset = (input_dim - resize_w) / 2; + + // resize + PPM resized_ppm; + resized_ppm.h = resize_h; + resized_ppm.w = resize_w; + resized_ppm.max = src.max; + resized_ppm.buffer.resize(resize_h * resize_w * 3, 0); + resizePPM(src, resized_ppm, resize_h, resize_w, 3); + + // pad + dst.h = target_h; + dst.w = target_w; + dst.max = src.max; + dst.buffer.resize(dst.h * dst.w * 3, 0); + padPPM(resized_ppm, dst, y_offset, input_dim - resize_h - y_offset, x_offset, input_dim - resize_w - x_offset); +} + +PPM resizeMask(const BBoxInfo& box, const float mask_threshold) +{ + PPM result; + if (!box.mask) + { + assert(result.buffer.size() == 0); + return result; + } + + const int h = box.box.y2 - box.box.y1; + const int w = box.box.x2 - box.box.x1; + + PPM raw_mask; + raw_mask.h = MaskRCNNConfig::MASK_POOL_SIZE * 2; + raw_mask.w = MaskRCNNConfig::MASK_POOL_SIZE * 2; + raw_mask.buffer.resize(raw_mask.h * raw_mask.w, 0); + for (int i = 0; i < raw_mask.h * raw_mask.w; i++) + raw_mask.buffer[i] = box.mask->raw[i]; + + PPM resized_mask; + resized_mask.h = h; + 
resized_mask.w = w; + resized_mask.buffer.resize(h * w, 0); + resizePPM(raw_mask, resized_mask, h, w, 1); + + result.h = h; + result.w = w; + result.buffer.resize(result.h * result.w, 0); + for (int i = 0; i < h * w; i++) + { + if (resized_mask.buffer[i] > mask_threshold) + { + result.buffer[i] = 1; + } + } + + return result; +} + +void maskPPM( + PPM& image, const PPM& mask, const int start_x, const int start_y, const std::vector& color) +{ + + float alpha = 0.6f; + + for (int y = 0; y < mask.h; ++y) + { + for (int x = 0; x < mask.w; ++x) + { + uint8_t mask_pixel = mask.buffer[y * mask.w + x]; + if (mask_pixel == 1) + { + assert(0 <= start_y + y && start_y + y < image.h); + assert(0 <= start_x + x && start_x + x < image.w); + + int cur_y = start_y + y; + int cur_x = start_x + x; + + float p_r = static_cast(image.buffer[(cur_y * image.w + cur_x) * 3]); + float p_g = static_cast(image.buffer[(cur_y * image.w + cur_x) * 3 + 1]); + float p_b = static_cast(image.buffer[(cur_y * image.w + cur_x) * 3 + 2]); + + image.buffer[(cur_y * image.w + cur_x) * 3] + = static_cast(std::max(0.0f, std::min(255.0f, p_r * (1 - alpha) + color[0] * alpha))); + image.buffer[(cur_y * image.w + cur_x) * 3 + 1] + = static_cast(std::max(0.0f, std::min(255.0f, p_g * (1 - alpha) + color[1] * alpha))); + image.buffer[(cur_y * image.w + cur_x) * 3 + 2] + = static_cast(std::max(0.0f, std::min(255.0f, p_b * (1 - alpha) + color[2] * alpha))); + } + else + assert(mask_pixel == 0); + } + } +} +void addBBoxPPM(PPM& ppm, const BBoxInfo& box, const PPM& resized_mask) +{ + const int x1 = box.box.x1; + const int y1 = box.box.y1; + const int x2 = box.box.x2; + const int y2 = box.box.y2; + std::vector color = {rand() % 256, rand() % 256, rand() % 256}; + + for (int x = x1; x <= x2; x++) + { + // bbox top border + ppm.buffer[(y1 * ppm.w + x) * 3] = color[0]; + ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = color[1]; + ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = color[2]; + // bbox bottom border + ppm.buffer[(y2 * ppm.w + x) * 3] = color[0]; + ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = color[1]; + ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = color[2]; + } + + for (int y = y1; y <= y2; y++) + { + // bbox left border + ppm.buffer[(y * ppm.w + x1) * 3] = color[0]; + ppm.buffer[(y * ppm.w + x1) * 3 + 1] = color[1]; + ppm.buffer[(y * ppm.w + x1) * 3 + 2] = color[2]; + // bbox right border + ppm.buffer[(y * ppm.w + x2) * 3] = color[0]; + ppm.buffer[(y * ppm.w + x2) * 3 + 1] = color[1]; + ppm.buffer[(y * ppm.w + x2) * 3 + 2] = color[2]; + } + + if (resized_mask.buffer.size() != 0) + { + maskPPM(ppm, resized_mask, x1, y1, color); + } +} +} // namespace MaskRCNNUtils + +struct SampleMaskRCNNParams : public samplesCommon::SampleParams +{ + std::string uffFileName; + float maskThreshold; +}; + +class SampleMaskRCNN +{ + template + using SampleUniquePtr = std::unique_ptr; + +public: + SampleMaskRCNN(const SampleMaskRCNNParams& params) + : mParams(params) + , mEngine(nullptr) + { + srand((int) time(0)); + } + + bool build(); + + bool infer(); + + bool teardown(); + +private: + SampleMaskRCNNParams mParams; + + nvinfer1::Dims mInputDims; + + // original images + std::vector> mOriginalPPMs; + + // processed images (resize + pad) + std::vector> mPPMs; + + std::shared_ptr mEngine; + + bool constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& parser); + + bool processInput(const samplesCommon::BufferManager& buffers); + + bool verifyOutput(const samplesCommon::BufferManager& buffers); + + vector decodeOutput(const int imageIdx, void* 
detectionsHost, void* masksHost); +}; + +bool SampleMaskRCNN::build() +{ + initLibNvInferPlugins(&gLogger.getTRTLogger(), ""); + auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(gLogger.getTRTLogger())); + if (!builder) + { + return false; + } + + auto network = SampleUniquePtr(builder->createNetwork()); + if (!network) + { + return false; + } + + auto parser = SampleUniquePtr(nvuffparser::createUffParser()); + if (!parser) + { + return false; + } + + auto constructed = constructNetwork(builder, network, parser); + if (!constructed) + { + return false; + } + + assert(network->getNbInputs() == 1); + mInputDims = network->getInput(0)->getDimensions(); + assert(mInputDims.nbDims == 3); + + assert(network->getNbOutputs() == 2); + + return true; +} + +bool SampleMaskRCNN::constructNetwork(SampleUniquePtr& builder, + SampleUniquePtr& network, SampleUniquePtr& parser) +{ + parser->registerInput( + mParams.inputTensorNames[0].c_str(), MaskRCNNConfig::IMAGE_SHAPE, nvuffparser::UffInputOrder::kNCHW); + for (size_t i = 0; i < mParams.outputTensorNames.size(); i++) + parser->registerOutput(mParams.outputTensorNames[i].c_str()); + + auto parsed = parser->parse(locateFile(mParams.uffFileName, mParams.dataDirs).c_str(), *network, DataType::kFLOAT); + if (!parsed) + { + return false; + } + + builder->setMaxBatchSize(mParams.batchSize); + builder->setMaxWorkspaceSize(2_GiB); + builder->setFp16Mode(mParams.fp16); + + // Only for speed test + if (mParams.int8) + { + samplesCommon::setAllTensorScales(network.get()); + builder->setInt8Mode(true); + } + + mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + if (!mEngine) + { + return false; + } + + return true; +} + +bool SampleMaskRCNN::infer() +{ + // Create RAII buffer manager object + samplesCommon::BufferManager buffers(mEngine, mParams.batchSize); + + auto context = SampleUniquePtr(mEngine->createExecutionContext()); + if (!context) + { + return false; + } + + // Read the input data into the managed buffers + assert(mParams.inputTensorNames.size() == 1); + if (!processInput(buffers)) + { + return false; + } + + // Memcpy from host input buffers to device input buffers + buffers.copyInputToDevice(); + + auto tStart = std::chrono::high_resolution_clock::now(); + bool status; + for (int i = 0; i < 10; i++) + { + status = context->execute(mParams.batchSize, buffers.getDeviceBindings().data()); + } + auto tEnd = std::chrono::high_resolution_clock::now(); + float totalHost = std::chrono::duration(tEnd - tStart).count(); + gLogInfo << "Run for 10 times with Batch Size " << mParams.batchSize << std::endl; + gLogInfo << "Average inference time is " << (totalHost / 10) / mParams.batchSize << " ms/frame" << std::endl; + + if (!status) + { + return false; + } + + // Memcpy from device output buffers to host output buffers + buffers.copyOutputToHost(); + + // Post-process detections and verify results + if (!verifyOutput(buffers)) + { + return false; + } + + return true; +} + +bool SampleMaskRCNN::teardown() +{ + //! Clean up the libprotobuf files as the parsing is complete + //! \note It is not safe to use any other part of the protocol buffers library after + //! ShutdownProtobufLibrary() has been called. 
+ nvuffparser::shutdownProtobufLibrary(); + return true; +} + +bool SampleMaskRCNN::processInput(const samplesCommon::BufferManager& buffers) +{ + const int inputC = mInputDims.d[0]; + const int inputH = mInputDims.d[1]; + const int inputW = mInputDims.d[2]; + const int batchSize = mParams.batchSize; + + // Available images + std::vector imageListCandidates = {"001763.ppm", "004545.ppm"}; + std::vector imageList; + for (int i = 0; i < batchSize; i++) + { + imageList.push_back(imageListCandidates[i % 2]); + } + + mPPMs.resize(batchSize); + mOriginalPPMs.resize(batchSize); + assert(mPPMs.size() <= imageList.size()); + for (int i = 0; i < batchSize; ++i) + { + MaskRCNNUtils::readPPMFile(locateFile(imageList[i], mParams.dataDirs), mOriginalPPMs[i]); + MaskRCNNUtils::preprocessPPM(mOriginalPPMs[i], mPPMs[i], inputH, inputW); + } + + float* hostDataBuffer = static_cast(buffers.getHostBuffer(mParams.inputTensorNames[0])); + float pixelMean[3]{123.7, 116.8, 103.9}; + // Host memory for input buffer + for (int i = 0, volImg = inputC * inputH * inputW; i < mParams.batchSize; ++i) + { + for (int c = 0; c < inputC; ++c) + { + // The color image to input should be in RGB order + for (unsigned j = 0, volChl = inputH * inputW; j < volChl; ++j) + { + hostDataBuffer[i * volImg + c * volChl + j] = float(mPPMs[i].buffer[j * inputC + c]) - pixelMean[c]; + } + } + } + + return true; +} + +vector SampleMaskRCNN::decodeOutput(const int imageIdx, void* detectionsHost, void* masksHost) +{ + int input_dim_h = MaskRCNNConfig::IMAGE_SHAPE.d[1], input_dim_w = MaskRCNNConfig::IMAGE_SHAPE.d[2]; + assert(input_dim_h == input_dim_w); + int image_height = mOriginalPPMs[imageIdx].h; + int image_width = mOriginalPPMs[imageIdx].w; + // resize the DsImage with scale + const int image_dim = std::max(image_height, image_width); + int resizeH = (int) image_height * input_dim_h / (float) image_dim; + int resizeW = (int) image_width * input_dim_w / (float) image_dim; + // keep accurary from (float) to (int), then to float + float window_x = (1.0f - (float) resizeW / input_dim_w) / 2.0f; + float window_y = (1.0f - (float) resizeH / input_dim_h) / 2.0f; + float window_width = (float) resizeW / input_dim_w; + float window_height = (float) resizeH / input_dim_h; + + float final_ratio_x = (float) image_width / window_width; + float final_ratio_y = (float) image_height / window_height; + + std::vector binfo; + + int detectionOffset = samplesCommon::volume(MaskRCNNConfig::MODEL_DETECTION_SHAPE); // (100,6) + int maskOffset = samplesCommon::volume(MaskRCNNConfig::MODEL_MASK_SHAPE); // (100, 81, 28, 28) + + MaskRCNNUtils::RawDetection* detections + = reinterpret_cast((float*) detectionsHost + imageIdx * detectionOffset); + MaskRCNNUtils::Mask* masks = reinterpret_cast((float*) masksHost + imageIdx * maskOffset); + for (int det_id = 0; det_id < MaskRCNNConfig::DETECTION_MAX_INSTANCES; det_id++) + { + MaskRCNNUtils::RawDetection cur_det = detections[det_id]; + int label = (int) cur_det.class_id; + if (label <= 0) + continue; + + MaskRCNNUtils::BBoxInfo det; + det.label = label; + det.prob = cur_det.score; + + det.box.x1 = std::min(std::max((cur_det.x1 - window_x) * final_ratio_x, 0.0f), (float) image_width); + det.box.y1 = std::min(std::max((cur_det.y1 - window_y) * final_ratio_y, 0.0f), (float) image_height); + det.box.x2 = std::min(std::max((cur_det.x2 - window_x) * final_ratio_x, 0.0f), (float) image_width); + det.box.y2 = std::min(std::max((cur_det.y2 - window_y) * final_ratio_y, 0.0f), (float) image_height); + + if (det.box.x2 <= 
det.box.x1 || det.box.y2 <= det.box.y1) + continue; + + det.mask = masks + det_id * MaskRCNNConfig::NUM_CLASSES + label; + + binfo.push_back(det); + } + + return binfo; +} + +bool SampleMaskRCNN::verifyOutput(const samplesCommon::BufferManager& buffers) +{ + void* detectionsHost = buffers.getHostBuffer(mParams.outputTensorNames[0]); + void* masksHost = buffers.getHostBuffer(mParams.outputTensorNames[1]); + + bool pass = true; + + for (int p = 0; p < mParams.batchSize; ++p) + { + vector binfo = decodeOutput(p, detectionsHost, masksHost); + for (size_t roi_id = 0; roi_id < binfo.size(); roi_id++) + { + const auto resized_mask = MaskRCNNUtils::resizeMask(binfo[roi_id], mParams.maskThreshold); // mask threshold + MaskRCNNUtils::addBBoxPPM(mOriginalPPMs[p], binfo[roi_id], resized_mask); + + gLogInfo << "Detected " << MaskRCNNConfig::CLASS_NAMES[binfo[roi_id].label] << " in" + << mOriginalPPMs[p].fileName << " with confidence " << binfo[roi_id].prob * 100.f + << " and coordinates (" << binfo[roi_id].box.x1 << ", " << binfo[roi_id].box.y1 << ", " + << binfo[roi_id].box.x2 << ", " << binfo[roi_id].box.y2 << ")" << std::endl; + } + gLogInfo << "The results are stored in current directory: " << std::to_string(p) + ".ppm" << std::endl; + MaskRCNNUtils::writePPMFile(std::to_string(p) + ".ppm", mOriginalPPMs[p]); + } + + return pass; +} + +SampleMaskRCNNParams initializeSampleParams(const samplesCommon::Args& args) +{ + SampleMaskRCNNParams params; + if (args.dataDirs.empty()) + { + params.dataDirs.push_back("data/maskrcnn/"); + params.dataDirs.push_back("data/maskrcnn/images/"); + params.dataDirs.push_back("data/samples/maskrcnn/"); + params.dataDirs.push_back("data/samples/maskrcnn/images/"); + } + else + { + params.dataDirs = args.dataDirs; + } + + params.inputTensorNames.push_back(MaskRCNNConfig::MODEL_INPUT); + params.batchSize = args.batch; + params.outputTensorNames.push_back(MaskRCNNConfig::MODEL_OUTPUTS[0]); + params.outputTensorNames.push_back(MaskRCNNConfig::MODEL_OUTPUTS[1]); + params.dlaCore = args.useDLACore; + params.int8 = args.runInInt8; + params.fp16 = args.runInFp16; + + params.uffFileName = MaskRCNNConfig::MODEL_NAME; + params.maskThreshold = MaskRCNNConfig::MASK_THRESHOLD; + + return params; +} + +void printHelpInfo() +{ + std::cout << "Usage: ./sample_maskRCNN [-h or --help] [-d or --datadir=]" << std::endl; + std::cout << "--help Display help information" << std::endl; + std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " + "multiple times to add multiple directories. If no data directories are given, the default is to use " + "data/samples/maskrcnn/ and data/maskrcnn/" + << std::endl; + std::cout << "--fp16 Specify to run in fp16 mode." << std::endl; + std::cout << "--batch Specify inference batch size." 
<< std::endl; +} + +int main(int argc, char** argv) +{ + samplesCommon::Args args; + bool argsOK = samplesCommon::parseArgs(args, argc, argv); + if (!argsOK) + { + gLogError << "Invalid arguments" << std::endl; + printHelpInfo(); + return EXIT_FAILURE; + } + if (args.help) + { + printHelpInfo(); + return EXIT_SUCCESS; + } + + auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); + + gLogger.reportTestStart(sampleTest); + + SampleMaskRCNN sample(initializeSampleParams(args)); + + gLogInfo << "Building and running a GPU inference engine for Mask RCNN" << std::endl; + + if (!sample.build()) + { + return gLogger.reportFail(sampleTest); + } + if (!sample.infer()) + { + return gLogger.reportFail(sampleTest); + } + if (!sample.teardown()) + { + return gLogger.reportFail(sampleTest); + } + + return gLogger.reportPass(sampleTest); +} diff --git a/samples/opensource/sampleUffPluginV2Ext/CMakeLists.txt b/samples/opensource/sampleUffPluginV2Ext/CMakeLists.txt new file mode 100755 index 00000000..23e4c591 --- /dev/null +++ b/samples/opensource/sampleUffPluginV2Ext/CMakeLists.txt @@ -0,0 +1,22 @@ +# +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +SET(SAMPLE_SOURCES + sampleUffPluginV2Ext.cpp +) + +set(SAMPLE_PARSERS "uff") + +include(../../CMakeSamplesTemplate.txt) diff --git a/samples/opensource/sampleUffPluginV2Ext/README.md b/samples/opensource/sampleUffPluginV2Ext/README.md new file mode 100755 index 00000000..1e21bc5d --- /dev/null +++ b/samples/opensource/sampleUffPluginV2Ext/README.md @@ -0,0 +1,305 @@ +# Adding A Custom Layer That Supports INT8 I/O To Your Network In TensorRT + +**Table of Contents** +- [Description](#description) +- [How does this sample work?](#how-does-this-sample-work) + * [Define layer outputs](#define-layer-outputs) + * [Restrict supported I/O format and data type](#restrict-supported-io-format-and-data-type) + * [Store information for layer execution](#store-information-for-layer-execution) + * [Serialize and deserialize the engine](#serialize-and-deserialize-the-engine) + * [Implement execution](#implement-execution) + * [Manage resources](#manage-resources) +- [Running the sample](#running-the-sample) + * [Sample `--help` options](#sample---help-options) +- [Additional resources](#additional-resources) +- [License](#license) +- [Changelog](#changelog) +- [Known issues](#known-issues) + +## Description + +This sample, `sampleUffPluginV2Ext`, implements the custom pooling layer for the MNIST model +(`data/samples/lenet5_custom_pool.uff`). Since the cuDNN function `cudnnPoolingForward` with float precision is used to +simulate an INT8 kernel, the performance for INT8 precision does not speed up. Nevertheless, the main purpose of this +sample is to demonstrate how to extend INT8 I/O for a plugin that is introduced in TensorRT 6.0. This requires the +interface replacement from `IPlugin/IPluginV2/IPluginV2Ext` to `IPluginV2IOExt` (or `IPluginV2DynamicExt` if dynamic +shape is required). 
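Before walking through the individual methods, the following declaration-only sketch (not the sample's verbatim code) shows roughly what the override surface of such a plugin looks like. It assumes the TensorRT 6 headers, uses the `UffPoolPluginV2` name discussed in this README, lists only the methods covered below, and omits all bodies:

```
    #include <cuda_runtime_api.h>
    #include "NvInfer.h"

    // Rough outline of the plugin declaration used by this sample; the real class
    // also overrides the remaining IPluginV2/IPluginV2Ext methods (type, version,
    // namespace, destroy, ...).
    class UffPoolPluginV2 : public nvinfer1::IPluginV2IOExt
    {
    public:
        // Output definition
        int getNbOutputs() const override;
        nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override;

        // INT8 I/O negotiation introduced by IPluginV2IOExt
        bool supportsFormatCombination(
            int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override;
        void configurePlugin(const nvinfer1::PluginTensorDesc* in, int nbInput,
            const nvinfer1::PluginTensorDesc* out, int nbOutput) override;

        // Serialization
        size_t getSerializationSize() const override;
        void serialize(void* buffer) const override;

        // Resource management and execution
        int initialize() override;
        void terminate() override;
        int enqueue(int batchSize, const void* const* inputs, void** outputs,
            void* workspace, cudaStream_t stream) override;

        // Cloning for the network, builder and engine
        nvinfer1::IPluginV2Ext* clone() const override;
    };
```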
+ +## How does this sample work? + +Specifically, this sample illustrates how to: +- [Define layer outputs](#define-layer-outputs) +- [Restrict supported I/O format and data type](#restrict-supported-io-format-and-data-type) +- [Store information for layer execution](#store-information-for-layer-execution) +- [Serialize and deserialize the engine](#serialize-and-deserialize-the-engine) +- [Implement execution](#implement-execution) +- [Manage resources](#manage-resources) + +### Define layer outputs + +`UffPoolPluginV2` implements the pooling layer which has a single output. Accordingly, the overridden +`IPluginV2IOExt::getNbOutputs` returns `1` and `IPluginV2IOExt::getOutputDimensions` includes validation checks and +returns the dimensions of the output. + +``` + Dims UffPoolPluginV2::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) + { + assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3); + int height = (inputs[0].d[1] + mPoolingParams.pH * 2 - mPoolingParams.mR) / mPoolingParams.mU + 1; + int width = (inputs[0].d[2] + mPoolingParams.pW * 2 - mPoolingParams.mS) / mPoolingParams.mV + 1; + DimsHW outDims(height, width); + return Dims3(inputs[0].d[0], outDims.h(), outDims.w()); + } +``` + +### Restrict supported I/O format and data type + +The builder of TensorRT will ask for supported formats by the `IPluginV2IOExt::supportsFormatCombination` method to give +it a chance to select a reasonable algorithm based on its I/O tensor description indexed by `pos`. In this sample, the +supported I/O tensor format is linear CHW while Int32 is excluded, but the I/O tensor must have the same data type. For +a more complex case, refer to [IPluginV2IOExt::supportsFormatCombination()](https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_i_plugin_v2_i_o_ext.html#a72f5170d7f1043d40e3c8b90b7b2f2f0) +in the API documentation for more details. + +``` + bool UffPoolPluginV2::supportsFormatCombination( + int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const + { + ... + bool condition = inOut[pos].format == TensorFormat::kLINEAR; + condition &= inOut[pos].type != DataType::kINT32; + condition &= inOut[pos].type == inOut[0].type; + return condition; + } +``` + +### Store information for layer execution + +TensorRT will invoke `IPluginV2IOExt::configurePlugin` method to pass the information to the plugin through +`PluginTensorDesc`, which are stored as member variables, serialized and deserialized if they are required by the layer +execution. + +``` + void UffPoolPluginV2::configurePlugin( + const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) + { + ... + mDataType = in[0].type; + mInputDims = in[0].dims; + mOutputDims = out[0].dims; + mPoolingParams.mC = mInputDims.d[0]; + mPoolingParams.mH = mInputDims.d[1]; + mPoolingParams.mW = mInputDims.d[2]; + mPoolingParams.mP = mOutputDims.d[1]; + mPoolingParams.mQ = mOutputDims.d[2]; + mInHostScale = in[0].scale >= 0.0f ? in[0].scale : -1.0f; + mOutHostScale = out[0].scale >= 0.0f ? out[0].scale : -1.0f; + } +``` + +### Serialize and deserialize the engine + +Fully compliant plugins support serialization and deserialization, as described in +[Serializing A Model In C++](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#serial_model_c). +In this sample, `UffPoolPluginV2` stores `PoolParameters`, I/O tensor dimensions, data types and optional INT8 scales. +The size of these variables is returned by `IPluginV2IOExt::getSerializationSize`. 
+
+```
+    size_t UffPoolPluginV2::getSerializationSize() const
+    {
+        size_t serializationSize = 0;
+        serializationSize += sizeof(mPoolingParams);
+        serializationSize += sizeof(mInputDims.nbDims);
+        serializationSize += sizeof(mInputDims.d[0]) * mInputDims.nbDims;
+        serializationSize += sizeof(mOutputDims.nbDims);
+        serializationSize += sizeof(mOutputDims.d[0]) * mOutputDims.nbDims;
+        serializationSize += sizeof(static_cast<int>(mDataType));
+        if (mDataType == DataType::kINT8)
+        {
+            serializationSize += sizeof(float) * 2;
+        }
+        return serializationSize;
+    }
+```
+
+When the engine is serialized, these variables are written to a buffer:
+
+```
+    void UffPoolPluginV2::serialize(void* buffer) const
+    {
+        char* d = static_cast<char*>(buffer);
+        const char* const a = d;
+        write(d, mPoolingParams);
+        write(d, mInputDims.nbDims);
+        ...
+    }
+```
+
+Then, when the engine is deployed, it is deserialized by `UffPoolPluginV2Creator::deserializePlugin`.
+
+```
+    IPluginV2* UffPoolPluginV2Creator::deserializePlugin(
+        const char* name, const void* serialData, size_t serialLength)
+    {
+        auto plugin = new UffPoolPluginV2(serialData, serialLength);
+        mPluginName = name;
+        return plugin;
+    }
+```
+
+The variables are then read back in the same order as they were serialized and their values are restored.
+
+```
+    UffPoolPluginV2::UffPoolPluginV2(const void* data, size_t length)
+    {
+        const char* d = static_cast<const char*>(data);
+        const char* const a = d;
+        mPoolingParams = read<PoolParameters>(d);
+        mInputDims.nbDims = read<int>(d);
+        ...
+    }
+```
+
+### Implement execution
+
+TensorRT invokes `IPluginV2::enqueue`, which contains the core algorithm of the plugin, to execute the custom layer at runtime. The method receives the actual batch size, the input and output buffers, the workspace, and the CUDA stream, together with the information captured during configuration.
+
+```
+    int UffPoolPluginV2::enqueue(
+        int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream)
+    {
+        ...
+        CHECK(cudnnPoolingForward(mCudnn, mPoolingDesc, &kONE, mSrcDescriptor, input, &kZERO, mDstDescriptor, output));
+        ...
+        return 0;
+    }
+```
+
+### Manage resources
+
+TensorRT guarantees that `IPluginV2IOExt::initialize` and `IPluginV2IOExt::terminate` are invoked in pairs for resource allocation and deallocation. In this sample, the overridden method `UffPoolPluginV2::initialize` creates the required cuDNN handle and sets up tensor descriptors. Conversely, `UffPoolPluginV2::terminate` destroys the handle and tensor descriptors.
+
+```
+    int UffPoolPluginV2::initialize()
+    {
+        CHECK(cudnnCreate(&mCudnn));
+        CHECK(cudnnCreateTensorDescriptor(&mSrcDescriptor));
+        CHECK(cudnnCreateTensorDescriptor(&mDstDescriptor));
+        CHECK(cudnnCreatePoolingDescriptor(&mPoolingDesc));
+        CHECK(cudnnSetPooling2dDescriptor(mPoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, mPoolingParams.mR,
+            mPoolingParams.mS, mPoolingParams.pH, mPoolingParams.pW, mPoolingParams.mU, mPoolingParams.mV));
+        return 0;
+    }
+```
+
+```
+    void UffPoolPluginV2::terminate()
+    {
+        CHECK(cudnnDestroyTensorDescriptor(mSrcDescriptor));
+        CHECK(cudnnDestroyTensorDescriptor(mDstDescriptor));
+        CHECK(cudnnDestroyPoolingDescriptor(mPoolingDesc));
+        CHECK(cudnnDestroy(mCudnn));
+    }
+```
+
+The plugin object created in the sample is cloned by each of the network, builder, and engine by calling the `IPluginV2IOExt::clone` method, which calls the plugin constructor and can also clone plugin parameters, if necessary.
+ +``` + IPluginV2Ext* UffPoolPluginV2::clone() const + { + auto* plugin = new UffPoolPluginV2(*this); + return plugin; + } +``` + +The cloned plugin objects are deleted when the network, builder, and engine are destroyed. This is done by invoking the +`IPluginV2IOExt::destroy` method. The plugin object created by `UffPoolPluginV2Creator::createPlugin` is also destroyed +by calling this method when the engine is destroyed. + +``` + void destroy() override + { + delete this; + } +``` + +## Running the sample + +1. Compile this sample by running `make` in the `/samples/sampleUffPluginV2Ext` directory. The +binary named `sample_uff_plugin_v2_ext` will be created in the `/bin` directory. + +``` + cd /samples/sampleUffPluginV2Ext + make +``` +Where `` is where you installed TensorRT. + +2. Run inference on the digit looping from 0 to 9: + +``` + ./sample_uff_plugin_v2_ext +``` + +3. Verify that all the 10 digits match properly. If the sample runs successfully you should see output similar to the +following. + +``` + &&&& RUNNING TensorRT.sample_uff_plugin_v2_ext # ./sample_uff_plugin_v2_ext + [I] ../../../../../data/samples/mnist/lenet5_custom_pool.uff + [I] [TRT] Detected 1 input and 1 output network tensors. + [I] Input: + ... (omitted messages) + [I] Average over 10 runs is 0.10516 ms. + &&&& PASSED TensorRT.sample_uff_plugin_v2_ext # ./sample_uff_plugin_v2_ext +``` +This output shows that the sample ran successfully; `PASSED`. + +### Sample `--help` options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option. + +``` + ./sample_uff_plugin_v2_ext --help + Usage: ./sample_uff_plugin_v2_ext [-h or --help] [-d or --datadir=] [--useDLACore=] + --help Display help information + --datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple + directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/) + --useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA + engines on the platform. + --int8 Run in Int8 mode. + --fp16 Run in FP16 mode. +``` + +## Additional resources + +The following resources provide a deeper understanding of sampleUffPluginV2Ext: + +**Models** + +- [Training LeNet on MNIST with Caffe](http://caffe.berkeleyvision.org/gathered/examples/mnist.html) +- [lenet.prototxt](https://github.com/BVLC/caffe/blob/master/examples/mnist/lenet.prototxt) + +**Documentation** + +- [Introduction To NVIDIA’s TensorRT Samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#samples) +- [Working With TensorRT Using The C++ API](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#c_topics) +- [NVIDIA’s TensorRT Documentation Library](https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/index.html) + +## License + +For terms and conditions for use, reproduction, and distribution, see the +[TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. + +## Changelog + +June 2019 +This is the initial open source release for this sample. + +## Known issues + +There are no known issues in this sample. 
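One detail worth keeping in mind when reading the implementation that follows: because cuDNN pooling runs in float here, the sample dequantizes the INT8 input with the input tensor's scale and requantizes the float result with the output tensor's scale (the values captured in `configurePlugin` as `mInHostScale` and `mOutHostScale`, applied inside `copyDeviceInputToFP32` and `copyDeviceToInt8Output`). A hedged sketch of that conversion is below; `dequantize` and `quantize` are illustrative helpers, not functions from the sample.

```
#include <algorithm>
#include <cstdint>

// Illustrative only: the equivalent math the sample applies around cudnnPoolingForward.
inline float dequantize(int8_t q, float scale)
{
    return static_cast<float>(q) * scale; // INT8 -> float using the input tensor scale
}

inline int8_t quantize(float x, float scale)
{
    float q = x / scale;                  // float -> INT8 using the output tensor scale
    q = std::max(q, float(INT8_MIN));     // clamp to the representable range, as the
    q = std::min(q, float(INT8_MAX));     // sample's float-to-int8 transform does
    return static_cast<int8_t>(q);
}
```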
diff --git a/samples/opensource/sampleUffPluginV2Ext/sampleUffPluginV2Ext.cpp b/samples/opensource/sampleUffPluginV2Ext/sampleUffPluginV2Ext.cpp new file mode 100644 index 00000000..96301dd6 --- /dev/null +++ b/samples/opensource/sampleUffPluginV2Ext/sampleUffPluginV2Ext.cpp @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "NvInfer.h" +#include "NvUffParser.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvUtils.h" +#include "argsParser.h" +#include "common.h" +#include "half.h" +#include "logger.h" + +using namespace nvuffparser; +using namespace nvinfer1; +using namespace samplesCommon; + +const std::string gSampleName = "TensorRT.sample_uff_plugin_v2_ext"; +samplesCommon::Args gArgs; + +template +void transform(const void* src, void* dst, int count) +{ + assert(in == out); + memcpy(dst, src, count * elementSize(in)); +} + +template <> +void transform(const void* src, void* dst, int count) +{ + auto srcPtr = static_cast(src); + auto dstPtr = static_cast(dst); + std::transform(srcPtr, srcPtr + count, dstPtr, [](half_float::half in) { return static_cast(in); }); +} + +template <> +void transform(const void* src, void* dst, int count) +{ + auto srcPtr = static_cast(src); + auto dstPtr = static_cast(dst); + std::transform(srcPtr, srcPtr + count, dstPtr, [](int8_t in) { return static_cast(in); }); +} + +template <> +void transform(const void* src, void* dst, int count) +{ + auto srcPtr = static_cast(src); + auto dstPtr = static_cast(dst); + std::transform(srcPtr, srcPtr + count, dstPtr, [](float in) { return static_cast(in); }); +} + +template <> +void transform(const void* src, void* dst, int count) +{ + auto srcPtr = static_cast(src); + auto dstPtr = static_cast(dst); + std::transform(srcPtr, srcPtr + count, dstPtr, [](float x) { + x = std::max(x, float(INT8_MIN)); + x = std::min(x, float(INT8_MAX)); + return static_cast(x); + }); +} + +static const int INPUT_H = 28; +static const int INPUT_W = 28; + +// simple PGM (portable greyscale map) reader +void readPGMFile(const std::string& filename, uint8_t buffer[INPUT_H * INPUT_W]) +{ + readPGMFile(locateFile(filename, gArgs.dataDirs), buffer, INPUT_H, INPUT_W); +} + +std::vector> calculateBindingBufferSizes( + const ICudaEngine& engine, int nbBindings, int batchSize) +{ + std::vector> sizes; + for (int i = 0; i < nbBindings; ++i) + { + Dims dims = engine.getBindingDimensions(i); + DataType dtype = engine.getBindingDataType(i); + + size_t eltCount = volume(dims) * batchSize; + sizes.push_back(std::make_pair(eltCount, dtype)); + } + return sizes; +} + +void* createMnistCudaBuffer(int64_t eltCount, DataType dtype, int num) +{ + // in that specific case, eltCount == INPUT_H * INPUT_W + assert(eltCount == INPUT_H * INPUT_W); + assert(elementSize(dtype) == sizeof(float)); + + size_t memSize = eltCount * elementSize(dtype); + std::vector inputs(eltCount); + + // read PGM file + uint8_t 
fileData[INPUT_H * INPUT_W]; + readPGMFile(std::to_string(num) + ".pgm", fileData); + + // display the number in an ascii representation + gLogInfo << "Input:\n"; + for (int i = 0; i < eltCount; i++) + { + gLogInfo << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % INPUT_W) ? "" : "\n"); + } + gLogInfo << std::endl; + + // initialize the inputs buffer + for (int i = 0; i < eltCount; i++) + { + inputs[i] = 1.0 - float(fileData[i]) / 255.0; + } + + void* deviceMem = safeCudaMalloc(memSize); + CHECK(cudaMemcpy(deviceMem, inputs.data(), memSize, cudaMemcpyHostToDevice)); + + return deviceMem; +} + +bool verifyOutput(int64_t eltCount, DataType dtype, void* buffer, int num) +{ + assert(elementSize(dtype) == sizeof(float)); + + bool pass = false; + + size_t memSize = eltCount * elementSize(dtype); + std::vector outputs(eltCount); + CHECK(cudaMemcpy(outputs.data(), buffer, memSize, cudaMemcpyDeviceToHost)); + + int maxIdx = std::distance(outputs.begin(), std::max_element(outputs.begin(), outputs.end())); + + std::ios::fmtflags prevSettings = gLogInfo.flags(); + gLogInfo.setf(std::ios::fixed, std::ios::floatfield); + gLogInfo.precision(6); + gLogInfo << "Output:\n"; + for (int64_t eltIdx = 0; eltIdx < eltCount; ++eltIdx) + { + gLogInfo << eltIdx << " => " << std::setw(10) << outputs[eltIdx] << "\t : "; + if (eltIdx == maxIdx) + { + gLogInfo << "***"; + pass = eltIdx == num ? true : false; + } + gLogInfo << "\n"; + } + gLogInfo.flags(prevSettings); + gLogInfo << std::endl; + return pass; +} + +struct PoolParameters +{ + // Input dimensions + int mC, mH, mW; + // Output dimensions + int mP, mQ; + // Kernel size + int mR, mS; + // Stride + int mU, mV; + // Padding + int pH, pW; + // Pooling Function + PoolingType pType; +}; + +class SampleUffPluginV2Ext +{ +public: + template + using SampleUniquePtr = std::unique_ptr; + + explicit SampleUffPluginV2Ext(const UffSampleParams& params) + : mParams(params) + { + } + + //! + //! \brief Creates the network, configures the builder and creates the network engine + //! + bool build() + { + SampleUniquePtr parser{createUffParser()}; + parser->registerInput("in", Dims3(1, 28, 28), UffInputOrder::kNCHW); + parser->registerOutput("out"); + + SampleUniquePtr builder{createInferBuilder(gLogger.getTRTLogger())}; + if (!builder.get()) + { + gLogError << "Failed to create infer builder. " << std::endl; + return false; + } + + SampleUniquePtr network{builder->createNetwork()}; + if (!network.get()) + { + gLogError << "Failed to create network. " << std::endl; + return false; + } + + if (!parser->parse(mParams.uffFileName.data(), *network, nvinfer1::DataType::kFLOAT)) + { + gLogError << "Failure while parsing UFF file" << std::endl; + return false; + } + + if (gArgs.runInInt8) + { + samplesCommon::setAllTensorScales(network.get(), 5.0f, 5.0f); + } + + SampleUniquePtr networkConfig{builder->createBuilderConfig()}; + networkConfig->setMaxWorkspaceSize(1_GiB); + if (gArgs.runInFp16) + { + networkConfig->setFlag(BuilderFlag::kFP16); + } + if (gArgs.runInInt8) + { + networkConfig->setFlag(BuilderFlag::kINT8); + } + networkConfig->setFlag(BuilderFlag::kSTRICT_TYPES); + if (gArgs.useDLACore >= 0) + { + networkConfig->setDLACore(gArgs.useDLACore); + } + + const int maxBatchSize = 1; + builder->setMaxBatchSize(maxBatchSize); + samplesCommon::enableDLA(builder.get(), networkConfig.get(), gArgs.useDLACore); + + mEngine.reset(builder->buildEngineWithConfig(*network, *networkConfig)); + if (!mEngine.get()) + { + gLogError << "Unable to create engine. 
" << std::endl; + return false; + } + return true; + } + + //! + //! \brief Runs the TensorRT inference engine for this sample + //! + bool infer() + { + bool pass{true}; + SampleUniquePtr context{mEngine->createExecutionContext()}; + + const int batchSize{1}; + const int nbBindings = mEngine->getNbBindings(); + assert(nbBindings == 2); + + std::vector buffers(nbBindings); + auto buffersSizes = calculateBindingBufferSizes(*mEngine, nbBindings, batchSize); + + const int bindingIdxInput = mEngine->bindingIsInput(0) ? 0 : 1; + const int bindingIdxOutput = mEngine->bindingIsInput(0) ? 1 : 0; + auto bufferSizesOutput = buffersSizes[bindingIdxOutput]; + buffers[bindingIdxOutput] = safeCudaMalloc(bufferSizesOutput.first * elementSize(bufferSizesOutput.second)); + + auto bufferSizesInput = buffersSizes[bindingIdxInput]; + + const int iterations{1}; + const int numberRun{10}; + for (int i = 0; i < iterations; i++) + { + float total{0.0f}, ms{0.0f}; + for (int num = 0; num < numberRun; num++) + { + buffers[bindingIdxInput] = createMnistCudaBuffer(bufferSizesInput.first, bufferSizesInput.second, num); + auto t_start = std::chrono::high_resolution_clock::now(); + context->execute(batchSize, &buffers[0]); + auto t_end = std::chrono::high_resolution_clock::now(); + ms = std::chrono::duration(t_end - t_start).count(); + total += ms; + + for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx) + { + if (mEngine->bindingIsInput(bindingIdx)) + { + continue; + } + auto bufferSizesOutput = buffersSizes[bindingIdx]; + pass &= verifyOutput(bufferSizesOutput.first, bufferSizesOutput.second, buffers[bindingIdx], num); + } + CHECK(cudaFree(buffers[bindingIdxInput])); + } + total /= numberRun; + gLogInfo << "Average over " << numberRun << " runs is " << total << " ms." << std::endl; + } + + for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx) + { + if (!mEngine->bindingIsInput(bindingIdx)) + { + CHECK(cudaFree(buffers[bindingIdx])); + } + } + return pass; + } + + //! + //! \brief Used to clean up any state created in the sample class + //! + bool teardown() + { + nvuffparser::shutdownProtobufLibrary(); + return true; + } + +private: + SampleUniquePtr mEngine; + samplesCommon::UffSampleParams mParams; +}; + +class UffPoolPluginV2 : public IPluginV2IOExt +{ +public: + UffPoolPluginV2(const PluginFieldCollection& fc) + { + // To do: TRT-TRT-8010 Populate Parameters from fc object w/ hard code + mPoolingParams.pType = PoolingType::kMAX; + mPoolingParams.mU = 2; + mPoolingParams.mV = 2; + mPoolingParams.mR = 2; + mPoolingParams.mS = 2; + mPoolingParams.pH = 0; + mPoolingParams.pW = 0; + mMode = CUDNN_POOLING_MAX; + (void) fc; + } + + UffPoolPluginV2(const void* data, size_t length) + { + const char* d = static_cast(data); + const char* const a = d; + mPoolingParams = read(d); + mInputDims.nbDims = read(d); + for (int i = 0; i < mInputDims.nbDims; ++i) + { + mInputDims.d[i] = read(d); + } + mOutputDims.nbDims = read(d); + for (int i = 0; i < mOutputDims.nbDims; ++i) + { + mOutputDims.d[i] = read(d); + } + mDataType = static_cast(read(d)); + mMode = mPoolingParams.pType == PoolingType::kMAX ? CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + if (mDataType == DataType::kINT8) + { + mInHostScale = read(d); + mOutHostScale = read(d); + } + assert(d == a + length); + } + + // It makes no sense to construct UffPoolPluginV2 without arguments. 
+ UffPoolPluginV2() = delete; + + virtual ~UffPoolPluginV2() {} + +public: + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3); + int height = (inputs[0].d[1] + mPoolingParams.pH * 2 - mPoolingParams.mR) / mPoolingParams.mU + 1; + int width = (inputs[0].d[2] + mPoolingParams.pW * 2 - mPoolingParams.mS) / mPoolingParams.mV + 1; + DimsHW outDims(height, width); + return Dims3(inputs[0].d[0], outDims.h(), outDims.w()); + } + + int initialize() override + { + CHECK(cudnnCreate(&mCudnn)); + CHECK(cudnnCreateTensorDescriptor(&mSrcDescriptor)); + CHECK(cudnnCreateTensorDescriptor(&mDstDescriptor)); + CHECK(cudnnCreatePoolingDescriptor(&mPoolingDesc)); + CHECK(cudnnSetPooling2dDescriptor(mPoolingDesc, mMode, CUDNN_NOT_PROPAGATE_NAN, mPoolingParams.mR, + mPoolingParams.mS, mPoolingParams.pH, mPoolingParams.pW, mPoolingParams.mU, mPoolingParams.mV)); + return 0; + } + + void terminate() override + { + CHECK(cudnnDestroyTensorDescriptor(mSrcDescriptor)); + CHECK(cudnnDestroyTensorDescriptor(mDstDescriptor)); + CHECK(cudnnDestroyPoolingDescriptor(mPoolingDesc)); + CHECK(cudnnDestroy(mCudnn)); + } + + size_t getWorkspaceSize(int maxBatchSize) const override + { + return 0; + } + + int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + const float kONE = 1.0f, kZERO = 0.0f; + cudnnSetStream(mCudnn, stream); + + const int N = 1; + // Use float to simulate int8 calculation + std::map typeMap = {{DataType::kFLOAT, CUDNN_DATA_FLOAT}, + {DataType::kHALF, CUDNN_DATA_HALF}, {DataType::kINT8, CUDNN_DATA_FLOAT}}; + assert(mDataType != DataType::kINT32); + CHECK(cudnnSetTensor4dDescriptor(mSrcDescriptor, CUDNN_TENSOR_NCHW, typeMap[mDataType], N, mPoolingParams.mC, + mPoolingParams.mH, mPoolingParams.mW)); + CHECK(cudnnSetTensor4dDescriptor(mDstDescriptor, CUDNN_TENSOR_NCHW, typeMap[mDataType], N, mPoolingParams.mC, + mPoolingParams.mP, mPoolingParams.mQ)); + void* input{nullptr}; + void* output{nullptr}; + if (mDataType == DataType::kINT8) + { + copyDeviceInputToFP32(inputs[0], input); + size_t outCount = getC(mOutputDims) * getH(mOutputDims) * getW(mOutputDims); + CHECK(cudaMalloc(&output, outCount * elementSize(DataType::kFLOAT))); + } + else + { + input = const_cast(inputs[0]); + output = const_cast(outputs[0]); + } + CHECK(cudnnPoolingForward(mCudnn, mPoolingDesc, &kONE, mSrcDescriptor, input, &kZERO, mDstDescriptor, output)); + if (mDataType == DataType::kINT8) + { + copyDeviceToInt8Output(output, outputs[0]); + } + return 0; + } + + size_t getSerializationSize() const override + { + size_t serializationSize = 0; + serializationSize += sizeof(mPoolingParams); + serializationSize += sizeof(mInputDims.nbDims); + serializationSize += sizeof(mInputDims.d[0]) * mInputDims.nbDims; + serializationSize += sizeof(mOutputDims.nbDims); + serializationSize += sizeof(mOutputDims.d[0]) * mOutputDims.nbDims; + serializationSize += sizeof(static_cast(mDataType)); + if (mDataType == DataType::kINT8) + { + serializationSize += sizeof(float) * 2; + } + return serializationSize; + } + + void serialize(void* buffer) const override + { + char* d = static_cast(buffer); + const char* const a = d; + write(d, mPoolingParams); + write(d, mInputDims.nbDims); + assert(mInputDims.nbDims <= mInputDims.MAX_DIMS); + for (int i = 0; i < mInputDims.nbDims; ++i) + { + write(d, mInputDims.d[i]); + } + write(d, 
mOutputDims.nbDims); + assert(mOutputDims.nbDims <= mOutputDims.MAX_DIMS); + for (int i = 0; i < mOutputDims.nbDims; ++i) + { + write(d, mOutputDims.d[i]); + } + write(d, static_cast(mDataType)); + if (mDataType == DataType::kINT8) + { + write(d, mInHostScale); + write(d, mOutHostScale); + } + assert(d == a + getSerializationSize()); + } + + void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override + { + assert(in && nbInput == 1); + assert(out && nbOutput == 1); + assert(in[0].type == out[0].type); + assert(in[0].format == TensorFormat::kLINEAR && out[0].format == TensorFormat::kLINEAR); + + mDataType = in[0].type; + mInputDims = in[0].dims; + mOutputDims = out[0].dims; + mPoolingParams.mC = mInputDims.d[0]; + mPoolingParams.mH = mInputDims.d[1]; + mPoolingParams.mW = mInputDims.d[2]; + mPoolingParams.mP = mOutputDims.d[1]; + mPoolingParams.mQ = mOutputDims.d[2]; + mInHostScale = in[0].scale >= 0.0f ? in[0].scale : -1.0f; + mOutHostScale = out[0].scale >= 0.0f ? out[0].scale : -1.0f; + } + + //! The combination of kLINEAR + kINT8/kHALF/kFLOAT is supported. + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override + { + assert(nbInputs == 1 && nbOutputs == 1 && pos < nbInputs + nbOutputs); + bool condition = inOut[pos].format == TensorFormat::kLINEAR; + condition &= inOut[pos].type != DataType::kINT32; + condition &= inOut[pos].type == inOut[0].type; + return condition; + } + + DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const override + { + assert(inputTypes && nbInputs == 1); + (void) index; + return inputTypes[0]; + } + + const char* getPluginType() const override + { + return "MaxPool"; + } + + const char* getPluginVersion() const override + { + return "2"; + } + + void destroy() override + { + delete this; + } + + IPluginV2Ext* clone() const override + { + auto* plugin = new UffPoolPluginV2(*this); + return plugin; + } + + void setPluginNamespace(const char* libNamespace) override + { + mNamespace = libNamespace; + } + + const char* getPluginNamespace() const override + { + return mNamespace.data(); + } + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override + { + return false; + } + + bool canBroadcastInputAcrossBatch(int inputIndex) const override + { + return false; + } + +private: + template + void write(char*& buffer, const T& val) const + { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); + } + + template + T read(const char*& buffer) const + { + T val = *reinterpret_cast(buffer); + buffer += sizeof(T); + return val; + } + + void copyDeviceInputToFP32(const void* src, void*& dst) + { + assert(mDataType == DataType::kINT8); + size_t inCount = getC(mInputDims) * getH(mInputDims) * getW(mInputDims); + std::unique_ptr inputTmp{new char[inCount * elementSize(mDataType)]}; + CHECK(cudaMemcpy(inputTmp.get(), src, inCount * elementSize(mDataType), cudaMemcpyDeviceToHost)); + std::unique_ptr inputFP32{new float[inCount]}; + transform(inputTmp.get(), inputFP32.get(), inCount); + // int8 scale + int hw = mInputDims.d[1] * mInputDims.d[2]; + for (int j = 0; j < mInputDims.d[0]; ++j) + { + std::transform(inputFP32.get() + hw * j, inputFP32.get() + hw * (j + 1), inputFP32.get() + hw * j, + [&](float in) -> float { return in * mInHostScale; }); + } + CHECK(cudaMalloc(&dst, inCount * elementSize(DataType::kFLOAT))); + CHECK(cudaMemcpy(dst, inputFP32.get(), inCount * 
elementSize(DataType::kFLOAT), cudaMemcpyHostToDevice)); + } + + void copyDeviceToInt8Output(const void* src, void* dst) + { + size_t outCount = getC(mOutputDims) * getH(mOutputDims) * getW(mOutputDims); + std::unique_ptr outTmp{new float[outCount]}; + CHECK(cudaMemcpy(outTmp.get(), src, outCount * elementSize(DataType::kFLOAT), cudaMemcpyDeviceToHost)); + std::unique_ptr outInt8{new char[outCount * elementSize(DataType::kINT8)]}; + // int8 + scale + int hw = mOutputDims.d[1] * mOutputDims.d[2]; + for (int j = 0; j < mInputDims.d[0]; ++j) + { + std::transform(outTmp.get() + hw * j, outTmp.get() + hw * (j + 1), outTmp.get() + hw * j, + [&](float in) -> float { return in / mOutHostScale; }); + } + transform(outTmp.get(), outInt8.get(), outCount); + CHECK(cudaMemcpy(dst, outInt8.get(), outCount, cudaMemcpyHostToDevice)); + } + +private: + cudnnHandle_t mCudnn; + cudnnTensorDescriptor_t mSrcDescriptor, mDstDescriptor; + cudnnPoolingDescriptor_t mPoolingDesc; + PoolParameters mPoolingParams; + cudnnPoolingMode_t mMode; + DataType mDataType; + + Dims mInputDims; + Dims mOutputDims; + float mInHostScale{-1.0f}; + float mOutHostScale{-1.0f}; + std::string mNamespace; +}; + +class UffPoolPluginV2Creator : public IPluginCreator +{ +public: + const char* getPluginName() const override + { + return "MaxPool"; + } + + const char* getPluginVersion() const override + { + return "2"; + } + + const PluginFieldCollection* getFieldNames() override + { + return &mFieldCollection; + } + + IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override + { + auto plugin = new UffPoolPluginV2(*fc); + mFieldCollection = *fc; + mPluginName = name; + return plugin; + } + + IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override + { + auto plugin = new UffPoolPluginV2(serialData, serialLength); + mPluginName = name; + return plugin; + } + + void setPluginNamespace(const char* libNamespace) override + { + mNamespace = libNamespace; + } + + const char* getPluginNamespace() const override + { + return mNamespace.c_str(); + } + +private: + std::string mNamespace; + std::string mPluginName; + PluginFieldCollection mFieldCollection{0, nullptr}; +}; + +REGISTER_TENSORRT_PLUGIN(UffPoolPluginV2Creator); + +// This function prints the help information for running this sample +void printHelpInfo() +{ + std::cout << "Usage: ./sample_uff_plugin_v2_ext [-h or --help] [-d or --datadir=] " + "[--useDLACore=]\n"; + std::cout << "--help Display help information\n"; + std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used " + "multiple times to add multiple directories. If no data directories are given, the default is to use " + "(data/samples/mnist/, data/mnist/)" + << std::endl; + std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " + "where n is the number of DLA engines on the platform." 
+ << std::endl; + std::cout << "--int8 Run in Int8 mode.\n"; + std::cout << "--fp16 Run in FP16 mode.\n"; +} + +int main(int argc, char** argv) +{ + bool argsOK = samplesCommon::parseArgs(gArgs, argc, argv); + if (gArgs.help) + { + printHelpInfo(); + return EXIT_SUCCESS; + } + if (!argsOK) + { + gLogError << "Invalid arguments" << std::endl; + printHelpInfo(); + return EXIT_FAILURE; + } + if (gArgs.dataDirs.empty()) + { + gArgs.dataDirs = std::vector{"data/samples/mnist/", "data/mnist/"}; + } + + auto sampleTest = gLogger.defineTest(gSampleName, argc, argv); + + gLogger.reportTestStart(sampleTest); + + samplesCommon::UffSampleParams params; + params.uffFileName = locateFile("lenet5_custom_pool.uff", gArgs.dataDirs); + gLogInfo << params.uffFileName << std::endl; + SampleUffPluginV2Ext sample(params); + + if (!sample.build()) + { + return gLogger.reportFail(sampleTest); + } + + if (!sample.infer()) + { + return gLogger.reportFail(sampleTest); + } + + if (!sample.teardown()) + { + return gLogger.reportFail(sampleTest); + } + + return gLogger.reportPass(sampleTest); +} diff --git a/samples/opensource/sampleUffSSD/sampleUffSSD.cpp b/samples/opensource/sampleUffSSD/sampleUffSSD.cpp index be2911fe..5594f036 100644 --- a/samples/opensource/sampleUffSSD/sampleUffSSD.cpp +++ b/samples/opensource/sampleUffSSD/sampleUffSSD.cpp @@ -99,7 +99,8 @@ class SampleUffSSD //! \brief Parses an UFF model for SSD and creates a TensorRT network //! bool constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser); + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser); //! //! \brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer @@ -136,13 +137,19 @@ bool SampleUffSSD::build() return false; } + auto config = SampleUniquePtr(builder->createBuilderConfig()); + if (!config) + { + return false; + } + auto parser = SampleUniquePtr(nvuffparser::createUffParser()); if (!parser) { return false; } - auto constructed = constructNetwork(builder, network, parser); + auto constructed = constructNetwork(builder, network, config, parser); if (!constructed) { return false; @@ -166,7 +173,8 @@ bool SampleUffSSD::build() //! \param builder Pointer to the engine builder //! bool SampleUffSSD::constructNetwork(SampleUniquePtr& builder, - SampleUniquePtr& network, SampleUniquePtr& parser) + SampleUniquePtr& network, SampleUniquePtr& config, + SampleUniquePtr& parser) { parser->registerInput(mParams.inputTensorNames[0].c_str(), DimsCHW(3, 300, 300), nvuffparser::UffInputOrder::kNCHW); parser->registerOutput(mParams.outputTensorNames[0].c_str()); @@ -178,8 +186,11 @@ bool SampleUffSSD::constructNetwork(SampleUniquePtr& builder } builder->setMaxBatchSize(mParams.batchSize); - builder->setMaxWorkspaceSize(2_GB); - builder->setFp16Mode(mParams.fp16); + config->setMaxWorkspaceSize(1_GiB); + if (mParams.fp16) + { + config->setFlag(BuilderFlag::kFP16); + } // Calibrator life time needs to last until after the engine is built. 
std::unique_ptr calibrator; @@ -191,16 +202,18 @@ bool SampleUffSSD::constructNetwork(SampleUniquePtr& builder const int imageC = 3; const int imageH = 300; const int imageW = 300; - const nvinfer1::DimsNCHW imageDims{mParams.calBatchSize, imageC, imageH, imageW}; + nvinfer1::DimsNCHW imageDims{}; + imageDims = nvinfer1::DimsNCHW{mParams.calBatchSize, imageC, imageH, imageW}; BatchStream calibrationStream( mParams.calBatchSize, mParams.nbCalBatches, imageDims, listFileName, mParams.dataDirs); - calibrator.reset( - new Int8EntropyCalibrator2(calibrationStream, 0, "UffSSD", mParams.inputTensorNames[0].c_str())); - builder->setInt8Mode(true); - builder->setInt8Calibrator(calibrator.get()); + calibrator.reset(new Int8EntropyCalibrator2( + calibrationStream, 0, "UffSSD", mParams.inputTensorNames[0].c_str())); + config->setFlag(BuilderFlag::kINT8); + config->setInt8Calibrator(calibrator.get()); } - mEngine = std::shared_ptr(builder->buildCudaEngine(*network), samplesCommon::InferDeleter()); + mEngine = std::shared_ptr( + builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter()); if (!mEngine) { return false; diff --git a/samples/opensource/trtexec/CMakeLists.txt b/samples/opensource/trtexec/CMakeLists.txt index 33ca82c4..06bf41a6 100644 --- a/samples/opensource/trtexec/CMakeLists.txt +++ b/samples/opensource/trtexec/CMakeLists.txt @@ -14,6 +14,8 @@ # limitations under the License. # SET(SAMPLE_SOURCES + ../../common/sampleEngines.cpp + ../../common/sampleOptions.cpp trtexec.cpp ) diff --git a/samples/opensource/trtexec/README.md b/samples/opensource/trtexec/README.md index 727b062c..293f645c 100644 --- a/samples/opensource/trtexec/README.md +++ b/samples/opensource/trtexec/README.md @@ -74,71 +74,7 @@ For more information about DLA, see [Working With DLA](https://docs.nvidia.com/d ## Tool command line arguments -To see the full list of available options and their descriptions, use the `./trtexec --help` command. 
-``` -&&&& RUNNING TensorRT.trtexec # ./trtexec --help -=== Model Options === - --uff= UFF model - --onnx= ONNX model - --model= Caffe model (default = no model, random weights used) - --deploy= Caffe prototxt file - --output=[,]* Output names (it can be specified multiple times); at least one output is required for UFF and Caffe - --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified multiple times; at least one is required for UFF models - --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use X,Y,Z=H,W,C order in --uffInput) - -=== Build Options === - --minBatch=N Set min batch size when building the engine (default = 1) - --optBatch=N Set optimal batch size when building the engine (default = maxBatch size) - --maxBatch=N Set max batch size when building the engine (default = minBatch size) - --dynamicShapes Enable dynamic shape for lower 3 (CWH or HWC) dimensions on inputs - --inputIOFormats=spec Type and formats of the input tensors (default = all inputs in fp32:chw) - --outputIOFormats=spec Type and formats of the output tensors (default = all outputs in fp32:chw) - IO Formats: spec ::= IOfmt[","spec] - IOfmt ::= type:fmt - type ::= "fp32"|"fp16"|"int32"|"int8" - fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32")["+"fmt] - --workspace=N Set workspace size in megabytes (default = 16) - --minTiming=M Set the minimum number of iterations used in kernel selection (default = 1) - --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = 8) - --fp16 Enable fp16 mode (default = disabled) - --int8 Run in int8 mode (default = disabled) - --calib= Read INT8 calibration cache file - --safe Only test the functionality available in safety restricted flows - --saveEngine= Save the serialized engine - --loadEngine= Load a serialized engine - -=== Inference Options === - --batch=N Set batch size (default = 1) - --iterations=N Run N inference iterations (default = 10) - --warmUp=N Run for N milliseconds to warmup before measuring performance (default = 200) - --duration=N Run performance measurements for at least N seconds of wallclock time (default = 10) - --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute (default = 0) - --streams=N Instantiate N engines to use concurrently (default = 1) - --threads Enable multithreading to drive engines with independent threads (default = disabled) - --useCudaGraph Use cuda graph to capture engine execution and then launch inference (default = false) - --buildOnly Skip inference perf measurement (default = disabled) -Note: if a batch size is specified only for inference, it will be used also as min, opt, and max batch size for the builder - -=== Reporting Options === - --verbose Use verbose logging (default = false) - --avgRuns=N Report performance measurements averaged over N consecutive iterations (default = 10) - --percentile=P Report performance for the P percentage (0<=P<=100, 0 representing max perf, and 100 representing min perf; (default = 99%) - --dumpOutput Print the output tensor(s) of the last inference iteration (default = disabled) - --dumpProfile Print profile information per layer (default = disabled) - --exportTimes= Write the timing results in a json file (default = disabled) - --exportProfile= Write the profile information per layer in a json file (default = disabled) - -=== System Options === - --device=N Select cuda device N (default = 0) - --useDLACore=N Select DLA core N for layers that support DLA (default = 
none) - --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers (default = disabled) - --plugins Plugin library (.so) to load (can be specified multiple times) - -=== Help === - --help Print this message -&&&& PASSED TensorRT.trtexec # ./trtexec --help - -``` +To see the full list of available options and their descriptions, issue the `./trtexec --help` command. **Note:** Specifying the `--safe` parameter turns the safety mode switch `ON`. By default, the `--safe` parameter is not specified; the safety mode switch is `OFF`. The layers and parameters that are contained within the `--safe` subset are restricted if the switch is set to `ON`. The switch is used for prototyping the safety restricted flows until the TensorRT safety runtime is made available. For more information, see the [Working With Automotive Safety section in the TensorRT Developer Guide](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#working_auto_safety). diff --git a/samples/opensource/trtexec/trtexec.cpp b/samples/opensource/trtexec/trtexec.cpp index d1e55b5c..fa6020cf 100644 --- a/samples/opensource/trtexec/trtexec.cpp +++ b/samples/opensource/trtexec/trtexec.cpp @@ -13,7 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include +#include #include #include #include @@ -21,8 +23,6 @@ #include #include #include -#include -#include #include #include #include @@ -38,66 +38,11 @@ #include "buffers.h" #include "common.h" #include "logger.h" +#include "sampleOptions.h" +#include "sampleEngines.h" using namespace nvinfer1; -using namespace nvcaffeparser1; -using namespace nvuffparser; -using namespace nvonnxparser; - -const std::string gSampleName = "TensorRT.trtexec"; - -struct Params -{ - std::string deployFile{}; - std::string modelFile{}; - std::string engine{}; - std::string saveEngine{}; - std::string loadEngine{}; - std::string calibrationCache{"CalibrationTable"}; - std::string outputCalibrationCache{"CalibrationTable"}; - std::string uffFile{}; - std::string onnxModelFile{}; - std::vector inputs{}; - std::vector outputs{}; - std::vector> uffInputs{}; - int device{0}; - int batchSize{1}; - int workspaceSize{16}; - int iterations{10}; - int avgRuns{10}; - int useDLACore{-1}; - bool safeMode{false}; - bool fp16{false}; - bool int8{false}; - bool verbose{false}; - bool allowGPUFallback{false}; - float pct{99}; - bool useSpinWait{false}; - bool dumpOutput{false}; - bool dumpLayerTime{false}; - bool help{false}; - std::vector plugins; -} gParams; - -inline int volume(Dims dims) -{ - return std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); -} - -std::map gInputDimensions; - -std::vector split(const std::string& s, char delim) -{ - std::vector res; - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) - { - res.push_back(item); - } - return res; -} +using namespace sample; float percentile(float percentage, std::vector& times) { @@ -111,304 +56,67 @@ float percentile(float percentage, std::vector& times) return std::numeric_limits::infinity(); } -class RndInt8Calibrator : public IInt8EntropyCalibrator2 +bool doInference(ICudaEngine& engine, const InferenceOptions& inference, const ReportingOptions& reporting) { -public: - RndInt8Calibrator(int totalSamples, std::string cacheFile, std::string outputCacheFile) - : mTotalSamples(totalSamples) - , mCurrentSample(0) - , mCacheFile(cacheFile) - , mOutputCacheFile(outputCacheFile) - { - 
std::default_random_engine generator; - std::uniform_real_distribution distribution(-1.0F, 1.0F); - for (auto& elem : gInputDimensions) - { - int elemCount = volume(elem.second); - - std::vector rnd_data(elemCount); - for (auto& val : rnd_data) - val = distribution(generator); - - void* data; - CHECK(cudaMalloc(&data, elemCount * sizeof(float))); - CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice)); - - mInputDeviceBuffers.insert(std::make_pair(elem.first, data)); - } - } - - ~RndInt8Calibrator() - { - for (auto& elem : mInputDeviceBuffers) - CHECK(cudaFree(elem.second)); - } - - int getBatchSize() const override - { - return 1; - } - - bool getBatch(void* bindings[], const char* names[], int nbBindings) override - { - if (mCurrentSample >= mTotalSamples) - return false; - - for (int i = 0; i < nbBindings; ++i) - bindings[i] = mInputDeviceBuffers[names[i]]; - - ++mCurrentSample; - return true; - } - - const void* readCalibrationCache(size_t& length) override - { - mCalibrationCache.clear(); - std::ifstream input(mCacheFile, std::ios::binary); - input >> std::noskipws; - if (input.good()) - std::copy(std::istream_iterator(input), std::istream_iterator(), - std::back_inserter(mCalibrationCache)); - - length = mCalibrationCache.size(); - return length ? &mCalibrationCache[0] : nullptr; - } - - void writeCalibrationCache(const void* cache, size_t length) override - { - std::ofstream output(mOutputCacheFile, std::ios::binary); - output.write(reinterpret_cast(cache), length); - } - -private: - int mTotalSamples; - int mCurrentSample; - std::string mCacheFile; - std::string mOutputCacheFile; - std::map mInputDeviceBuffers; - std::vector mCalibrationCache; -}; - -void configureBuilder(IBuilder* builder, RndInt8Calibrator& calibrator) -{ - builder->setMaxBatchSize(gParams.batchSize); - builder->setMaxWorkspaceSize(static_cast(gParams.workspaceSize) << 20); - builder->setFp16Mode(gParams.fp16); - if (gParams.int8) - { - builder->setInt8Mode(true); - builder->setInt8Calibrator(&calibrator); - } - - if (gParams.safeMode) - { - builder->setEngineCapability( - gParams.useDLACore >= 0 ? EngineCapability::kSAFE_DLA : EngineCapability::kSAFE_GPU); - } -} - -ICudaEngine* caffeToTRTModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger.getTRTLogger()); - if (builder == nullptr) - { - return nullptr; - } - - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - ICaffeParser* parser = createCaffeParser(); - const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(), - gParams.modelFile.empty() ? 
0 : gParams.modelFile.c_str(), *network, DataType::kFLOAT); - - if (!blobNameToTensor) - { - return nullptr; - } - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - Dims3 dims = static_cast(network->getInput(i)->getDimensions()); - gParams.inputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - gLogInfo << "Input \"" << network->getInput(i)->getName() << "\": " << dims.d[0] << "x" << dims.d[1] << "x" - << dims.d[2] << std::endl; - } - - // specify which tensors are outputs - for (auto& s : gParams.outputs) - { - if (blobNameToTensor->find(s.c_str()) == nullptr) - { - gLogError << "could not find output blob " << s << std::endl; - return nullptr; - } - network->markOutput(*blobNameToTensor->find(s.c_str())); - } - - for (int i = 0, n = network->getNbOutputs(); i < n; i++) - { - Dims3 dims = static_cast(network->getOutput(i)->getDimensions()); - gLogInfo << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.d[0] << "x" << dims.d[1] << "x" - << dims.d[2] << std::endl; - } - - // Build the engine - RndInt8Calibrator calibrator(1, gParams.calibrationCache, gParams.outputCalibrationCache); - configureBuilder(builder, calibrator); - - samplesCommon::enableDLA(builder, gParams.useDLACore, gParams.allowGPUFallback); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - { - gLogError << "could not build engine" << std::endl; - } - - parser->destroy(); - network->destroy(); - builder->destroy(); - return engine; -} + IExecutionContext* context = engine.createExecutionContext(); -ICudaEngine* uffToTRTModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger.getTRTLogger()); - if (builder == nullptr) + // Dump inferencing time per layer basis + SimpleProfiler profiler("Layer time"); + if (reporting.profile) { - return nullptr; + context->setProfiler(&profiler); } - // parse the caffe model to populate the network, then set the outputs - INetworkDefinition* network = builder->createNetwork(); - IUffParser* parser = createUffParser(); - - // specify which tensors are outputs - for (auto& s : gParams.outputs) + for (int b = 0; b < engine.getNbBindings(); ++b) { - if (!parser->registerOutput(s.c_str())) + if (!engine.bindingIsInput(b)) { - gLogError << "Failed to register output " << s << std::endl; - return nullptr; + continue; } - } - - // specify which tensors are inputs (and their dimensions) - for (auto& s : gParams.uffInputs) - { - if (!parser->registerInput(s.first.c_str(), s.second, UffInputOrder::kNCHW)) + auto dims = context->getBindingDimensions(b); + if (dims.d[0] == -1) { - gLogError << "Failed to register input " << s.first << std::endl; - return nullptr; + auto shape = inference.shapes.find(engine.getBindingName(b)); + if (shape == inference.shapes.end()) + { + gLogError << "Missing dynamic batch size in inference" << std::endl; + return false; + } + dims.d[0] = shape->second.d[0]; + context->setBindingDimensions(b, dims); } } - if (!parser->parse(gParams.uffFile.c_str(), *network)) - return nullptr; - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - Dims3 dims = static_cast(network->getInput(i)->getDimensions()); - gParams.inputs.push_back(network->getInput(i)->getName()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - } - - // Build the engine - RndInt8Calibrator calibrator(1, gParams.calibrationCache, gParams.outputCalibrationCache); - configureBuilder(builder, 
calibrator); - - samplesCommon::enableDLA(builder, gParams.useDLACore); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - if (engine == nullptr) - gLogError << "could not build engine" << std::endl; - - parser->destroy(); - network->destroy(); - builder->destroy(); - return engine; -} - -ICudaEngine* onnxToTRTModel() -{ - // create the builder - IBuilder* builder = createInferBuilder(gLogger.getTRTLogger()); - if (builder == nullptr) - { - return nullptr; - } - nvinfer1::INetworkDefinition* network = builder->createNetwork(); - - // parse the onnx model to populate the network, then set the outputs - IParser* parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger()); - if (!parser->parseFromFile(gParams.onnxModelFile.c_str(), static_cast(gLogger.getReportableSeverity()))) - { - gLogError << "failed to parse onnx file" << std::endl; - return nullptr; - } - - for (int i = 0, n = network->getNbInputs(); i < n; i++) - { - Dims3 dims = static_cast(network->getInput(i)->getDimensions()); - gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims)); - } - - // Build the engine - RndInt8Calibrator calibrator(1, gParams.calibrationCache, gParams.outputCalibrationCache); - configureBuilder(builder, calibrator); - - samplesCommon::enableDLA(builder, gParams.useDLACore); - - ICudaEngine* engine = builder->buildCudaEngine(*network); - - if (engine == nullptr) - { - gLogError << "could not build engine" << std::endl; - } - - parser->destroy(); - network->destroy(); - builder->destroy(); - return engine; -} - -void doInference(ICudaEngine& engine) -{ - IExecutionContext* context = engine.createExecutionContext(); - - // Dump inferencing time per layer basis - SimpleProfiler profiler("Layer time"); - if (gParams.dumpLayerTime) - { - context->setProfiler(&profiler); - } - // Use an aliasing shared_ptr since we don't want engine to be deleted when bufferManager goes out of scope. std::shared_ptr emptyPtr{}; std::shared_ptr aliasPtr(emptyPtr, &engine); - samplesCommon::BufferManager bufferManager(aliasPtr, gParams.batchSize); + samplesCommon::BufferManager bufferManager(aliasPtr, inference.batch, inference.batch ? nullptr : context); std::vector buffers = bufferManager.getDeviceBindings(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); cudaEvent_t start, end; - unsigned int cudaEventFlags = gParams.useSpinWait ? 
cudaEventDefault : cudaEventBlockingSync; - CHECK(cudaEventCreateWithFlags(&start, cudaEventFlags)); - CHECK(cudaEventCreateWithFlags(&end, cudaEventFlags)); + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&end)); - std::vector times(gParams.avgRuns); - for (int j = 0; j < gParams.iterations; j++) + std::vector times(reporting.avgs); + for (int j = 0; j < inference.iterations; j++) { float totalGpu{0}; // GPU timer float totalHost{0}; // Host timer - for (int i = 0; i < gParams.avgRuns; i++) + for (int i = 0; i < reporting.avgs; i++) { auto tStart = std::chrono::high_resolution_clock::now(); cudaEventRecord(start, stream); - context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr); + if (inference.batch) + { + context->enqueue(inference.batch, &buffers[0], stream, nullptr); + } + else + { + context->enqueueV2(&buffers[0], stream, nullptr); + } cudaEventRecord(end, stream); cudaEventSynchronize(end); @@ -420,14 +128,14 @@ void doInference(ICudaEngine& engine) totalGpu += ms; } - totalGpu /= gParams.avgRuns; - totalHost /= gParams.avgRuns; - gLogInfo << "Average over " << gParams.avgRuns << " runs is " << totalGpu << " ms (host walltime is " - << totalHost << " ms, " << static_cast(gParams.pct) << "\% percentile time is " - << percentile(gParams.pct, times) << ")." << std::endl; + totalGpu /= reporting.avgs; + totalHost /= reporting.avgs; + gLogInfo << "Average over " << reporting.avgs << " runs is " << totalGpu << " ms (host walltime is " + << totalHost << " ms, " << static_cast(reporting.percentile) << "\% percentile time is " + << percentile(reporting.percentile, times) << ")." << std::endl; } - if (gParams.dumpOutput) + if (reporting.output) { bufferManager.copyOutputToHost(); int nbBindings = engine.getNbBindings(); @@ -442,7 +150,7 @@ void doInference(ICudaEngine& engine) } } - if (gParams.dumpLayerTime) + if (reporting.profile) { gLogInfo << profiler; } @@ -451,398 +159,118 @@ void doInference(ICudaEngine& engine) cudaEventDestroy(start); cudaEventDestroy(end); context->destroy(); -} - -static void printUsage() -{ - printf("\n"); - printf("Mandatory params:\n"); - printf(" --deploy= Caffe deploy file\n"); - printf(" OR --uff= UFF file\n"); - printf(" OR --onnx= ONNX Model file\n"); - printf(" OR --loadEngine= Load a saved engine\n"); - - printf("\nMandatory params for UFF:\n"); - printf( - " --uffInput=,C,H,W Input blob name and its dimensions for UFF parser (can be specified multiple " - "times)\n"); - printf(" --output= Output blob name (can be specified multiple times)\n"); - - printf("\nMandatory params for Caffe:\n"); - printf(" --output= Output blob name (can be specified multiple times)\n"); - - printf("\nOptional params:\n"); - printf(" --model= Caffe model file (default = no model, random weights used)\n"); - printf(" --batch=N Set batch size (default = %d)\n", gParams.batchSize); - printf(" --device=N Set cuda device to N (default = %d)\n", gParams.device); - printf(" --iterations=N Run N iterations (default = %d)\n", gParams.iterations); - printf(" --avgRuns=N Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", - gParams.avgRuns); - printf( - " --percentile=P For each iteration, report the percentile time at P percentage (0<=P<=100, with 0 " - "representing min, and 100 representing max; default = %.1f%%)\n", - gParams.pct); - printf(" --workspace=N Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize); - printf(" --safe Only test the functionality available in safety restricted flows.\n"); - printf(" --fp16 
Run in fp16 mode (default = false). Permits 16-bit kernels\n"); - printf(" --int8 Run in int8 mode (default = false). Currently no support for ONNX model.\n"); - printf(" --verbose Use verbose logging (default = false)\n"); - printf(" --saveEngine= Save a serialized engine to file.\n"); - printf(" --loadEngine= Load a serialized engine from file.\n"); - printf(" --plugins= Load a TensorRT custom plugin.\n"); - printf(" --calib= Read INT8 calibration cache file. Currently no support for ONNX model.\n"); - printf(" --calibOut= Write INT8 calibration cache file. Currently no support for ONNX model.\n"); - printf( - " --useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, " - "where n is the number of DLA engines on the platform.\n"); - printf( - " --allowGPUFallback If --useDLACore flag is present and if a layer can't run on DLA, then run on GPU. " - "\n"); - printf( - " --useSpinWait Actively wait for work completion. This option may decrease multi-process " - "synchronization time at the cost of additional CPU usage. (default = false)\n"); - printf(" --dumpOutput Dump outputs at end of test. \n"); - printf(" --dumpLayerTime Dump inferencing time of each layer at end of test. \n"); - printf(" -h, --help Print usage\n"); - fflush(stdout); -} - -bool parseString(const char* arg, const char* name, std::string& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = arg + n + 3; - gLogInfo << name << ": " << value << std::endl; - } - return match; -} - -template -bool parseAtoi(const char* arg, const char* name, T& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = static_cast(atoi(arg + n + 3)); - gLogInfo << name << ": " << value << std::endl; - } - return match; -} - -bool parseInt(const char* arg, const char* name, int& value) -{ - return parseAtoi(arg, name, value); -} - -bool parseUnsigned(const char* arg, const char* name, unsigned int& value) -{ - return parseAtoi(arg, name, value); -} - -// parse a boolean option of the form --name, or optionally, -letter. -bool parseBool(const char* arg, const char* name, bool& value, char letter = '\0') -{ - bool match - = arg[0] == '-' && ((arg[1] == '-' && !strcmp(arg + 2, name)) || (letter && arg[1] == letter && !arg[2])); - if (match) - { - // Always report the long form of the option. - gLogInfo << name << std::endl; - value = true; - } - return match; -} -bool parseFloat(const char* arg, const char* name, float& value) -{ - size_t n = strlen(name); - bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '='; - if (match) - { - value = atof(arg + n + 3); - gLogInfo << name << ": " << value << std::endl; - } - return match; -} - -bool validateArgs() -{ - // UFF and Caffe files require output nodes to be specified. - if ((!gParams.uffFile.empty() || !gParams.deployFile.empty()) && gParams.outputs.empty()) - { - gLogError << "ERROR: At least one output must be specified." << std::endl; - return false; - } - if (!gParams.uffFile.empty() && gParams.uffInputs.empty()) - { - gLogError << "ERROR: At least one UFF input must be specified to run UFF models." << std::endl; - return false; - } - if (!gParams.loadEngine.empty() && !gParams.saveEngine.empty()) - { - gLogError << "ERROR: --saveEngine and --loadEngine cannot be specified at the same time." 
-        return false;
-    }
     return true;
 }
 
-bool parseArgs(int argc, char* argv[])
+int main(int argc, char** argv)
 {
-    if (argc < 2)
-    {
-        printUsage();
-        return false;
-    }
-
-    for (int j = 1; j < argc; j++)
-    {
-        if (parseString(argv[j], "model", gParams.modelFile) || parseString(argv[j], "deploy", gParams.deployFile))
-        {
-            continue;
-        }
-        if (parseString(argv[j], "saveEngine", gParams.saveEngine))
-        {
-            continue;
-        }
-        if (parseString(argv[j], "loadEngine", gParams.loadEngine))
-        {
-            continue;
-        }
-        if (parseString(argv[j], "engine", gParams.engine))
-        {
-            gLogError << "--engine has been deprecated. Please use --saveEngine and --loadEngine instead." << std::endl;
-            return false;
-        }
-        if (parseString(argv[j], "uff", gParams.uffFile))
-        {
-            continue;
-        }
-
-        if (parseString(argv[j], "onnx", gParams.onnxModelFile))
-        {
-            continue;
-        }
-
-        if (parseString(argv[j], "calib", gParams.calibrationCache))
-            continue;
-
-        if (parseString(argv[j], "calibOut", gParams.outputCalibrationCache))
-            continue;
-
-        std::string input;
-        if (parseString(argv[j], "input", input))
-        {
-            gLogWarning << "--input has been deprecated and ignored." << std::endl;
-            continue;
-        }
-
-        std::string output;
-        if (parseString(argv[j], "output", output))
-        {
-            gParams.outputs.push_back(output);
-            continue;
-        }
-
-        std::string uffInput;
-        if (parseString(argv[j], "uffInput", uffInput))
-        {
-            std::vector<std::string> uffInputStrs = split(uffInput, ',');
-            if (uffInputStrs.size() != 4)
-            {
-                gLogError << "Invalid uffInput: " << uffInput << std::endl;
-                return false;
-            }
-
-            gParams.uffInputs.push_back(std::make_pair(uffInputStrs[0],
-                Dims3(atoi(uffInputStrs[1].c_str()), atoi(uffInputStrs[2].c_str()), atoi(uffInputStrs[3].c_str()))));
-            continue;
-        }
-
-        if (parseInt(argv[j], "batch", gParams.batchSize) || parseInt(argv[j], "iterations", gParams.iterations)
-            || parseInt(argv[j], "avgRuns", gParams.avgRuns) || parseInt(argv[j], "device", gParams.device)
-            || parseInt(argv[j], "workspace", gParams.workspaceSize)
-            || parseInt(argv[j], "useDLACore", gParams.useDLACore))
-            continue;
+    const std::string sampleName = "TensorRT.trtexec";
+    auto sampleTest = gLogger.defineTest(sampleName, argc, argv);
 
-        if (parseFloat(argv[j], "percentile", gParams.pct))
-            continue;
-
-        std::string plugin;
-        if (parseString(argv[j], "plugins", plugin))
-        {
-            gParams.plugins.push_back(plugin);
-            continue;
-        }
-
-        if (parseBool(argv[j], "safe", gParams.safeMode) || parseBool(argv[j], "fp16", gParams.fp16)
-            || parseBool(argv[j], "int8", gParams.int8) || parseBool(argv[j], "verbose", gParams.verbose)
-            || parseBool(argv[j], "allowGPUFallback", gParams.allowGPUFallback)
-            || parseBool(argv[j], "useSpinWait", gParams.useSpinWait)
-            || parseBool(argv[j], "dumpOutput", gParams.dumpOutput)
-            || parseBool(argv[j], "dumpLayerTime", gParams.dumpLayerTime)
-            || parseBool(argv[j], "help", gParams.help, 'h'))
-            continue;
-
-        gLogError << "Unknown argument: " << argv[j] << std::endl;
-        return false;
-    }
-
-    return validateArgs();
-}
+    gLogger.reportTestStart(sampleTest);
 
-static ICudaEngine* createEngine()
-{
-    ICudaEngine* engine{nullptr};
+    Arguments args = argsToArgumentsMap(argc, argv);
+    AllOptions options;
 
-    // Load serialized engine file if specified by user
-    if (!gParams.loadEngine.empty())
+    if (!args.empty())
     {
-        std::vector<char> engineData;
-        size_t fsize{0};
-
+        bool failed{false};
+        try
         {
-            // Open engine file
-            std::ifstream engineFile(gParams.loadEngine, std::ios::binary);
-            if (!engineFile.good())
+            options.parse(args);
+
+            if (!args.empty())
             {
-                gLogInfo << "Error loading engine file: " << gParams.loadEngine << std::endl;
<< "Error loading engine file: " << gParams.loadEngine << std::endl; - return engine; + for (const auto& arg : args) + { + gLogError << "Unknown option: " << arg.first << " " << arg.second << std::endl; + } + failed = true; } - - // Read engine file to memory - engineFile.seekg(0, engineFile.end); - fsize = engineFile.tellg(); - engineFile.seekg(0, engineFile.beg); - engineData.resize(fsize); - engineFile.read(engineData.data(), fsize); - engineFile.close(); } - - // Create runtime - IRuntime* runtime = createInferRuntime(gLogger.getTRTLogger()); - if (gParams.useDLACore >= 0) + catch (const std::invalid_argument& arg) { - runtime->setDLACore(gParams.useDLACore); + gLogError << arg.what() << std::endl; + failed = true; } - // Create engine - engine = runtime->deserializeCudaEngine(engineData.data(), fsize, nullptr); - gLogInfo << gParams.loadEngine << " has been successfully loaded." << std::endl; - - runtime->destroy(); - return engine; - } - - // User has not provided an engine file - if ((!gParams.deployFile.empty()) || (!gParams.uffFile.empty()) || (!gParams.onnxModelFile.empty())) - { - if (!gParams.uffFile.empty()) - { - engine = uffToTRTModel(); - } - else if (!gParams.onnxModelFile.empty()) - { - engine = onnxToTRTModel(); - } - else + if (failed) { - engine = caffeToTRTModel(); - } - - if (!engine) - { - gLogError << "Engine could not be created" << std::endl; - return nullptr; - } - - // User wants to save engine to file - if (!gParams.saveEngine.empty()) - { - // Open output file - std::ofstream engineFile(gParams.saveEngine, std::ios::binary); - if (!engineFile) - { - gLogError << "Could not open output engine file: " << gParams.saveEngine << std::endl; - return nullptr; - } - - IHostMemory* serializedEngine = engine->serialize(); - if (serializedEngine == nullptr) - { - gLogError << "Could not serialize engine." 
-                return nullptr;
-            }
-
-            engineFile.write(reinterpret_cast<const char*>(serializedEngine->data()), serializedEngine->size());
-            serializedEngine->destroy();
-            gLogInfo << "Engine has been successfully saved to: " << gParams.saveEngine << std::endl;
+            AllOptions::help(std::cout);
+            std::cout << "Note: the following options are not fully supported in trtexec:"
+                         " dynamic shapes, multistream/threads, cuda graphs, json logs,"
+                         " and actual data IO"
+                      << std::endl;
+            return gLogger.reportFail(sampleTest);
         }
-
-        return engine;
     }
-
-    // Complain about empty deploy file
-    gLogError << "Deploy file not specified" << std::endl;
-
-    return nullptr;
-}
-
-int main(int argc, char** argv)
-{
-    // create a TensorRT model from the caffe/uff/onnx model and serialize it to a stream
-
-    auto sampleTest = gLogger.defineTest(gSampleName, argc, argv);
-
-    gLogger.reportTestStart(sampleTest);
-
-    if (!parseArgs(argc, argv))
+    else
     {
-        return gLogger.reportFail(sampleTest);
+        options.helps = true;
     }
 
-    if (gParams.help)
+    if (options.helps)
     {
-        printUsage();
+        AllOptions::help(std::cout);
+        std::cout << "Note: the following options are not fully supported in trtexec:"
+                     " dynamic shapes, multistream/threads, cuda graphs, json logs,"
+                     " and actual data IO"
+                  << std::endl;
         return gLogger.reportPass(sampleTest);
     }
 
-    if (gParams.verbose)
+    gLogInfo << options;
+    if (options.reporting.verbose)
     {
         setReportableSeverity(Severity::kVERBOSE);
     }
 
-    cudaSetDevice(gParams.device);
+    cudaSetDevice(options.system.device);
 
     initLibNvInferPlugins(&gLogger.getTRTLogger(), "");
 
-    for (const auto& plugin : gParams.plugins)
+    for (const auto& pluginPath : options.system.plugins)
     {
-        if (EXIT_SUCCESS != loadLibrary(plugin))
-        {
-            return gLogger.reportFail(sampleTest);
-        }
+        gLogInfo << "Loading supplied plugin library: " << pluginPath << std::endl;
+        samplesCommon::loadLibrary(pluginPath);
     }
 
-    ICudaEngine* engine = createEngine();
+    ICudaEngine* engine{nullptr};
+    if (options.build.load)
+    {
+        engine = loadEngine(options.build.engine, options.system.DLACore, gLogError);
+    }
+    else
+    {
+        engine = modelToEngine(options.model, options.build, options.system, gLogError);
+    }
     if (!engine)
     {
         gLogError << "Engine could not be created" << std::endl;
         return gLogger.reportFail(sampleTest);
     }
-
-    if (gParams.uffFile.empty() && gParams.onnxModelFile.empty())
+    if (options.build.save)
     {
-        nvcaffeparser1::shutdownProtobufLibrary();
+        saveEngine(*engine, options.build.engine, gLogError);
     }
-    else if (gParams.deployFile.empty() && gParams.onnxModelFile.empty())
+
+    if (!options.inference.skip)
     {
-        nvuffparser::shutdownProtobufLibrary();
+        if (options.build.safe && options.system.DLACore >= 0)
+        {
+            gLogInfo << "Safe DLA capability is detected. Please save DLA loadable with --saveEngine option, "
+                        "then use dla_safety_runtime to run inference with saved DLA loadable, "
+                        "or alternatively run with your own application"
+                     << std::endl;
+            return gLogger.reportFail(sampleTest);
+        }
+        if (!doInference(*engine, options.inference, options.reporting))
+        {
+            gLogError << "Inference failure" << std::endl;
+            return gLogger.reportFail(sampleTest);
+        }
     }
-
-    doInference(*engine);
     engine->destroy();
 
     return gLogger.reportPass(sampleTest);
diff --git a/third_party/protobuf.cmake b/third_party/protobuf.cmake
index 6a5b8f60..8a2c7c0c 100644
--- a/third_party/protobuf.cmake
+++ b/third_party/protobuf.cmake
@@ -21,11 +21,11 @@ include(ExternalProject)
 macro(configure_protobuf VERSION)
     set(protobufPackage "protobuf-cpp-${VERSION}.tar.gz")
     set(Protobuf_PKG_URL "https://github.com/google/protobuf/releases/download/v${VERSION}/${protobufPackage}")
-    set(Protobuf_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/${PROTOBUF_TARGET})
+    set(Protobuf_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR})
     set(Protobuf_TARGET third_party.protobuf)
 
-    set(PROTOBUF_CFLAGS -Dgoogle=google_private)
-    set(PROTOBUF_CXXFLAGS -Dgoogle=google_private)
+    set(PROTOBUF_CFLAGS "-Dgoogle=google_private")
+    set(PROTOBUF_CXXFLAGS "-Dgoogle=google_private")
 
     ExternalProject_Add(${Protobuf_TARGET}
         PREFIX ${Protobuf_TARGET}
@@ -33,7 +33,6 @@ macro(configure_protobuf VERSION)
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ${CMAKE_COMMAND} ${Protobuf_INSTALL_DIR}/${Protobuf_TARGET}/src/${Protobuf_TARGET}/cmake
             -G${CMAKE_GENERATOR}
-            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
            -DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc
@@ -65,10 +64,9 @@ macro(configure_protobuf VERSION)
        set(Protobuf_PROTOC_LIBRARY "${Protobuf_LIB_DIR}/libprotoc.a")
        set(Protobuf_LITE_LIBRARY "${Protobuf_LIB_DIR}/libprotobuf-lite.a")
     endif()
-    set(Protobuf_INSTALL_DIR "${CMAKE_BINARY_DIR}/${Protobuf_TARGET}")
 
     set(protolibType STATIC)
-    if (NOT(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") AND NOT(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64"))
+    if ((${CMAKE_SYSTEM_NAME} STREQUAL "Linux") AND NOT(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64"))
        message(STATUS "Setting up another Protobuf build for cross compilation targeting ${CMAKE_SYSTEM_PROCESSOR}-${CMAKE_SYSTEM_NAME}")
        # In case of cross-compilation for QNX requires additional CXX flags
        if(${CMAKE_SYSTEM_NAME} STREQUAL "qnx")
@@ -77,24 +75,23 @@ macro(configure_protobuf VERSION)
        endif()
        ExternalProject_Add(${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}
            PREFIX ${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}
-           DOWNLOAD_COMMAND ""
+           URL ${Protobuf_PKG_URL}
            UPDATE_COMMAND ""
-           CONFIGURE_COMMAND ${CMAKE_COMMAND} ${Protobuf_INSTALL_DIR}/${Protobuf_TARGET}/src/${Protobuf_TARGET}/cmake
+           CONFIGURE_COMMAND ${CMAKE_COMMAND} ${Protobuf_INSTALL_DIR}/${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}/src/${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}/cmake
            -G${CMAKE_GENERATOR}
-           -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}/${CMAKE_SYSTEM_PROCESSOR}
-           -E env CXXFLAGS="-Dgoogle=google_private"
            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
            -DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER}
            -DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER}
            -DCMAKE_C_FLAGS=${PROTOBUF_CFLAGS}
            -DCMAKE_CXX_FLAGS=${PROTOBUF_CXXFLAGS}
-           -DCMAKE_INSTALL_PREFIX=${Protobuf_INSTALL_DIR}/${Protobuf_TARGET}
+           -DCMAKE_INSTALL_PREFIX=${Protobuf_INSTALL_DIR}/${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}
            -Dprotobuf_BUILD_TESTS=OFF
            SOURCE_SUBDIR cmake
+           BINARY_DIR ${Protobuf_INSTALL_DIR}/${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}/src/${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}/
        )
-       set(Protobuf_LIB_DIR "${CMAKE_BINARY_DIR}/${Protobuf_TARGET}/lib")
+       set(Protobuf_LIB_DIR "${CMAKE_BINARY_DIR}/${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR}/lib")
        set(Protobuf_INCLUDE_DIR "${CMAKE_BINARY_DIR}/${Protobuf_TARGET}/include")
        set(Protobuf_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/${Protobuf_TARGET}/include")
        if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -111,16 +108,21 @@ macro(configure_protobuf VERSION)
     endif()
 
     add_library(protobuf::libprotobuf ${protolibType} IMPORTED)
-    add_dependencies(protobuf::libprotobuf ${Protobuf_TARGET})
     set_target_properties(protobuf::libprotobuf PROPERTIES
         IMPORTED_LOCATION "${Protobuf_LIBRARY}"
     )
 
     add_library(protobuf::libprotobuf-lite ${protolibType} IMPORTED)
-    add_dependencies(protobuf::libprotobuf-lite ${Protobuf_TARGET})
     set_target_properties(protobuf::libprotobuf-lite PROPERTIES
         IMPORTED_LOCATION "${Protobuf_LITE_LIBRARY}"
     )
+    if ((${CMAKE_SYSTEM_NAME} STREQUAL "Linux") AND NOT(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64"))
+        add_dependencies(protobuf::libprotobuf ${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR})
+        add_dependencies(protobuf::libprotobuf-lite ${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR})
+    else ()
+        add_dependencies(protobuf::libprotobuf ${Protobuf_TARGET}_${CMAKE_SYSTEM_PROCESSOR})
+        add_dependencies(protobuf::libprotobuf-lite ${Protobuf_TARGET})
+    endif()
 
     add_library(protobuf::libprotoc ${protolibType} IMPORTED)
     add_dependencies(protobuf::libprotoc ${Protobuf_TARGET})
@@ -205,7 +207,7 @@ function(protobuf_generate_cpp SRCS HDRS)
            COMMAND LIBRARY_PATH=${Protobuf_LIB_DIR} ${Protobuf_PROTOC_EXECUTABLE}
            ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/${PROTO_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${PROTO_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/${proto}
            WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-           DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${proto}" protobuf::libprotobuf Protobuf
+           DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${proto}" protobuf::libprotobuf Protobuf protobuf::protoc
            COMMENT "${proto} -> ${PROTO_DIR}/${PROTO_SRC} ${PROTO_DIR}/${PROTO_HEADER}"
        )