diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000000..359bb5307e
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000000..28c863b6ff
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,46 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="18">
+            <item index="0" class="java.lang.String" itemvalue="onnxruntime" />
+            <item index="1" class="java.lang.String" itemvalue="onnx-simplifier" />
+            <item index="2" class="java.lang.String" itemvalue="scipy" />
+            <item index="3" class="java.lang.String" itemvalue="thop" />
+            <item index="4" class="java.lang.String" itemvalue="opencv-python" />
+            <item index="5" class="java.lang.String" itemvalue="torch" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+            <item index="7" class="java.lang.String" itemvalue="torchvision" />
+            <item index="8" class="java.lang.String" itemvalue="tqdm" />
+            <item index="9" class="java.lang.String" itemvalue="pandas" />
+            <item index="10" class="java.lang.String" itemvalue="tensorboard" />
+            <item index="11" class="java.lang.String" itemvalue="seaborn" />
+            <item index="12" class="java.lang.String" itemvalue="matplotlib" />
+            <item index="13" class="java.lang.String" itemvalue="Cython" />
+            <item index="14" class="java.lang.String" itemvalue="pycocotools" />
+            <item index="15" class="java.lang.String" itemvalue="h5py" />
+            <item index="16" class="java.lang.String" itemvalue="opencv_python" />
+            <item index="17" class="java.lang.String" itemvalue="Pillow" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+          <option value="N801" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="tkinter.*" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000000..105ce2da2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000000..d1e22ecb89
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000..ecffae6dc6
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/yolov7.iml" filepath="$PROJECT_DIR$/.idea/yolov7.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000..94a25f7f4c
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/yolov7.iml b/.idea/yolov7.iml
new file mode 100644
index 0000000000..8b8c395472
--- /dev/null
+++ b/.idea/yolov7.iml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/README.md b/README.md
index 4d1f48c9b6..1892b1b067 100644
--- a/README.md
+++ b/README.md
@@ -151,6 +151,13 @@ python detect.py --weights yolov7.pt --conf 0.25 --img-size 640 --source inferen
     </a>
 </div>
 
+
+## Export
+Use the args `--include-nms` can to export end to end onnx model which include the `EfficientNMS`. 
+```shell
+python models/export.py --weights yolov7.pt --grid  --include-nms
+```
+
 ## Citation
 
 ```
diff --git a/export.py b/export.py
index 06dfc942c3..fd4975f6f9 100644
--- a/export.py
+++ b/export.py
@@ -12,6 +12,7 @@
 from utils.activations import Hardswish, SiLU
 from utils.general import set_logging, check_img_size
 from utils.torch_utils import select_device
+from utils.add_nms import RegisterNMS
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -22,6 +23,7 @@
     parser.add_argument('--grid', action='store_true', help='export Detect() layer grid')
     parser.add_argument('--device', default='cpu', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
     parser.add_argument('--simplify', action='store_true', help='simplify onnx model')
+    parser.add_argument('--include-nms', action='store_true', help='export end2end onnx')
     opt = parser.parse_args()
     opt.img_size *= 2 if len(opt.img_size) == 1 else 1  # expand
     print(opt)
@@ -52,7 +54,9 @@
         #     m.forward = m.forward_export  # assign forward (optional)
     model.model[-1].export = not opt.grid  # set Detect() layer grid export
     y = model(img)  # dry run
-
+    if opt.include_nms:
+        model.model[-1].include_nms = True
+        y = None
     # TorchScript export
     try:
         print('\nStarting TorchScript export with torch %s...' % torch.__version__)
@@ -75,16 +79,23 @@
                           dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # size(1,3,640,640)
                                         'output': {0: 'batch', 2: 'y', 3: 'x'}} if opt.dynamic else None)
 
-        # Checks
-        onnx_model = onnx.load(f)  # load onnx model
-        onnx.checker.check_model(onnx_model)  # check onnx model
-
-        # # Metadata
-        # d = {'stride': int(max(model.stride))}
-        # for k, v in d.items():
-        #     meta = onnx_model.metadata_props.add()
-        #     meta.key, meta.value = k, str(v)
-        # onnx.save(onnx_model, f)
+        if opt.include_nms:
+            print('Registering NMS plugin...')
+            mo = RegisterNMS(f)
+            mo.register_nms()
+            mo.save(f)
+        else:
+            # Checks
+            onnx_model = onnx.load(f)  # load onnx model
+            onnx.checker.check_model(onnx_model)  # check onnx model
+            # print(onnx.helper.printable_graph(onnx_model.graph))  # print a human readable model
+
+            # # Metadata
+            # d = {'stride': int(max(model.stride))}
+            # for k, v in d.items():
+            #     meta = onnx_model.metadata_props.add()
+            #     meta.key, meta.value = k, str(v)
+            # onnx.save(onnx_model, f)
 
         if opt.simplify:
             try:
@@ -95,11 +106,9 @@
                 assert check, 'assert check failed'
             except Exception as e:
                 print(f'Simplifier failure: {e}')
-        # print(onnx.helper.printable_graph(onnx_model.graph))  # print a human readable model
         print('ONNX export success, saved as %s' % f)
     except Exception as e:
         print('ONNX export failure: %s' % e)
-
     # CoreML export
     try:
         import coremltools as ct
diff --git a/models/common.py b/models/common.py
index 53e3f87193..111af708de 100644
--- a/models/common.py
+++ b/models/common.py
@@ -236,7 +236,7 @@ def forward(self, x):
 class ResX(Res):
     # ResNet bottleneck
     def __init__(self, c1, c2, shortcut=True, g=32, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
-        super().__init__(c1, c2, shortcu, g, e)
+        super().__init__(c1, c2, shortcut, g, e)
         c_ = int(c2 * e)  # hidden channels
 
 
diff --git a/models/yolo.py b/models/yolo.py
index 5f45aad886..5d2845f1f0 100644
--- a/models/yolo.py
+++ b/models/yolo.py
@@ -5,7 +5,7 @@
 
 sys.path.append('./')  # to run '$ python *.py' files in subdirectories
 logger = logging.getLogger(__name__)
-
+import torch
 from models.common import *
 from models.experimental import *
 from utils.autoanchor import check_anchor_order
@@ -23,7 +23,7 @@
 class Detect(nn.Module):
     stride = None  # strides computed during build
     export = False  # onnx export
-
+    include_nms = False 
     def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
         super(Detect, self).__init__()
         self.nc = nc  # number of classes
@@ -48,7 +48,6 @@ def forward(self, x):
             if not self.training:  # inference
                 if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                     self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
-
                 y = x[i].sigmoid()
                 if not torch.onnx.is_in_onnx_export():
                     y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
@@ -59,13 +58,28 @@ def forward(self, x):
                     y = torch.cat((xy, wh, y[..., 4:]), -1)
                 z.append(y.view(bs, -1, self.no))
 
-        return x if self.training else (torch.cat(z, 1), x)
+        if self.include_nms:
+            z = self.convert(z)
+
+        return x if self.training else (z, ) if self.include_nms else (torch.cat(z, 1), x)
 
     @staticmethod
     def _make_grid(nx=20, ny=20):
         yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
         return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
 
+    def convert(self, z):
+        z = torch.cat(z, 1)
+        box = z[:, :, :4]
+        conf = z[:, :, 4:5]
+        score = z[:, :, 5:]
+        score *= conf
+        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
+                                           dtype=torch.float32,
+                                           device=z.device)
+        box @= convert_matrix                          
+        return (box, score)
+
 
 class IDetect(nn.Module):
     stride = None  # strides computed during build
diff --git a/utils/add_nms.py b/utils/add_nms.py
new file mode 100644
index 0000000000..8cfa23919e
--- /dev/null
+++ b/utils/add_nms.py
@@ -0,0 +1,151 @@
+import numpy as np
+import onnx
+from onnx import shape_inference
+import onnx_graphsurgeon as gs
+import logging
+
+LOGGER = logging.getLogger(__name__)
+
+class RegisterNMS(object):
+    def __init__(
+        self,
+        onnx_model_path: str,
+        precision: str = "fp32",
+    ):
+
+        self.graph = gs.import_onnx(onnx.load(onnx_model_path))
+        assert self.graph
+        LOGGER.info("ONNX graph created successfully")
+        # Fold constants via ONNX-GS that PyTorch2ONNX may have missed
+        self.graph.fold_constants()
+        self.precision = precision
+        self.batch_size = 1
+    def infer(self):
+        """
+        Sanitize the graph by cleaning any unconnected nodes, do a topological resort,
+        and fold constant inputs values. When possible, run shape inference on the
+        ONNX graph to determine tensor shapes.
+        """
+        for _ in range(3):
+            count_before = len(self.graph.nodes)
+
+            self.graph.cleanup().toposort()
+            try:
+                for node in self.graph.nodes:
+                    for o in node.outputs:
+                        o.shape = None
+                model = gs.export_onnx(self.graph)
+                model = shape_inference.infer_shapes(model)
+                self.graph = gs.import_onnx(model)
+            except Exception as e:
+                LOGGER.info(f"Shape inference could not be performed at this time:\n{e}")
+            try:
+                self.graph.fold_constants(fold_shapes=True)
+            except TypeError as e:
+                LOGGER.error(
+                    "This version of ONNX GraphSurgeon does not support folding shapes, "
+                    f"please upgrade your onnx_graphsurgeon module. Error:\n{e}"
+                )
+                raise
+
+            count_after = len(self.graph.nodes)
+            if count_before == count_after:
+                # No new folding occurred in this iteration, so we can stop for now.
+                break
+
+    def save(self, output_path):
+        """
+        Save the ONNX model to the given location.
+        Args:
+            output_path: Path pointing to the location where to write
+                out the updated ONNX model.
+        """
+        self.graph.cleanup().toposort()
+        model = gs.export_onnx(self.graph)
+        onnx.save(model, output_path)
+        LOGGER.info(f"Saved ONNX model to {output_path}")
+
+    def register_nms(
+        self,
+        *,
+        score_thresh: float = 0.25,
+        nms_thresh: float = 0.45,
+        detections_per_img: int = 100,
+    ):
+        """
+        Register the ``EfficientNMS_TRT`` plugin node.
+        NMS expects these shapes for its input tensors:
+            - box_net: [batch_size, number_boxes, 4]
+            - class_net: [batch_size, number_boxes, number_labels]
+        Args:
+            score_thresh (float): The scalar threshold for score (low scoring boxes are removed).
+            nms_thresh (float): The scalar threshold for IOU (new boxes that have high IOU
+                overlap with previously selected boxes are removed).
+            detections_per_img (int): Number of best detections to keep after NMS.
+        """
+
+        self.infer()
+        # Find the concat node at the end of the network
+        op_inputs = self.graph.outputs
+        op = "EfficientNMS_TRT"
+        attrs = {
+            "plugin_version": "1",
+            "background_class": -1,  # no background class
+            "max_output_boxes": detections_per_img,
+            "score_threshold": score_thresh,
+            "iou_threshold": nms_thresh,
+            "score_activation": False,
+            "box_coding": 0,
+        }
+
+        if self.precision == "fp32":
+            dtype_output = np.float32
+        elif self.precision == "fp16":
+            dtype_output = np.float16
+        else:
+            raise NotImplementedError(f"Currently not supports precision: {self.precision}")
+
+        # NMS Outputs
+        output_num_detections = gs.Variable(
+            name="num_detections",
+            dtype=np.int32,
+            shape=[self.batch_size, 1],
+        )  # A scalar indicating the number of valid detections per batch image.
+        output_boxes = gs.Variable(
+            name="detection_boxes",
+            dtype=dtype_output,
+            shape=[self.batch_size, detections_per_img, 4],
+        )
+        output_scores = gs.Variable(
+            name="detection_scores",
+            dtype=dtype_output,
+            shape=[self.batch_size, detections_per_img],
+        )
+        output_labels = gs.Variable(
+            name="detection_classes",
+            dtype=np.int32,
+            shape=[self.batch_size, detections_per_img],
+        )
+
+        op_outputs = [output_num_detections, output_boxes, output_scores, output_labels]
+
+        # Create the NMS Plugin node with the selected inputs. The outputs of the node will also
+        # become the final outputs of the graph.
+        self.graph.layer(op=op, name="batched_nms", inputs=op_inputs, outputs=op_outputs, attrs=attrs)
+        LOGGER.info(f"Created NMS plugin '{op}' with attributes: {attrs}")
+
+        self.graph.outputs = op_outputs
+
+        self.infer()
+
+    def save(self, output_path):
+        """
+        Save the ONNX model to the given location.
+        Args:
+            output_path: Path pointing to the location where to write
+                out the updated ONNX model.
+        """
+        self.graph.cleanup().toposort()
+        model = gs.export_onnx(self.graph)
+        onnx.save(model, output_path)
+        LOGGER.info(f"Saved ONNX model to {output_path}")
\ No newline at end of file