Faster-RCNN object detection models from TensorFlow
dkurt committed May 30, 2018
1 parent 44572fa commit bf87a43
Showing 8 changed files with 457 additions and 5 deletions.
6 changes: 6 additions & 0 deletions modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -581,6 +581,12 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
static Ptr<ProposalLayer> create(const LayerParams& params);
};

class CV_EXPORTS CropAndResizeLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};

//! @}
//! @}
CV__DNN_EXPERIMENTAL_NS_END
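The new class only declares a static factory. As a hedged illustration (not part of this diff), an instance could presumably be built directly from `LayerParams` carrying the two required attributes:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    LayerParams lp;
    lp.set("height", 14);  // output crop height
    lp.set("width", 14);   // output crop width
    Ptr<Layer> roi = CropAndResizeLayer::create(lp);
    CV_Assert(!roi.empty());
    return 0;
}
```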
1 change: 1 addition & 0 deletions modules/dnn/src/init.cpp
@@ -84,6 +84,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Reshape, ReshapeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Flatten, FlattenLayer);
CV_DNN_REGISTER_LAYER_CLASS(ResizeNearestNeighbor, ResizeNearestNeighborLayer);
CV_DNN_REGISTER_LAYER_CLASS(CropAndResize, CropAndResizeLayer);

CV_DNN_REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer);
CV_DNN_REGISTER_LAYER_CLASS(Deconvolution, DeconvolutionLayer);
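Registration also makes the layer constructible by its string type. A minimal sketch of that lookup, assuming `LayerFactory::createLayerInstance` as the public entry point (illustrative, not from this commit):

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/layer.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    LayerParams lp;
    lp.set("height", 7);
    lp.set("width", 7);
    // The string must match the name used in CV_DNN_REGISTER_LAYER_CLASS above.
    Ptr<Layer> layer = LayerFactory::createLayerInstance("CropAndResize", lp);
    CV_Assert(!layer.empty());
    return 0;
}
```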
108 changes: 108 additions & 0 deletions modules/dnn/src/layers/crop_and_resize_layer.cpp
@@ -0,0 +1,108 @@
#include "../precomp.hpp"
#include "layers_common.hpp"

namespace cv { namespace dnn {

class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
{
public:
CropAndResizeLayerImpl(const LayerParams& params)
{
CV_Assert(params.has("width"), params.has("height"));
outWidth = params.get<int>("width");
outHeight = params.get<int>("height");
}

bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 2, inputs[0].size() == 4);
if (inputs[0][0] != 1)
CV_Error(Error::StsNotImplemented, "");
outputs.resize(1, MatShape(4));
outputs[0][0] = inputs[1][2]; // Number of bounding boxes.
outputs[0][1] = inputs[0][1]; // Number of channels.
outputs[0][2] = outHeight;
outputs[0][3] = outWidth;
return false;
}

void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}

void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());

Mat& inp = *inputs[0];
Mat& out = outputs[0];
Mat boxes = inputs[1]->reshape(1, inputs[1]->total() / 7);
const int numChannels = inp.size[1];
const int inpHeight = inp.size[2];
const int inpWidth = inp.size[3];
const int inpSpatialSize = inpHeight * inpWidth;
const int outSpatialSize = outHeight * outWidth;
CV_Assert(inp.isContinuous(), out.isContinuous());

for (int b = 0; b < boxes.rows; ++b)
{
float* outDataBox = out.ptr<float>(b);
float left = boxes.at<float>(b, 3);
float top = boxes.at<float>(b, 4);
float right = boxes.at<float>(b, 5);
float bottom = boxes.at<float>(b, 6);
float boxWidth = right - left;
float boxHeight = bottom - top;

float heightScale = boxHeight * static_cast<float>(inpHeight - 1) / (outHeight - 1);
float widthScale = boxWidth * static_cast<float>(inpWidth - 1) / (outWidth - 1);
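// Sample the crop below with bilinear interpolation; box coordinates are normalized
// to [0, 1] and the (size - 1) scaling mirrors TensorFlow's crop_and_resize sampling.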
for (int y = 0; y < outHeight; ++y)
{
float input_y = top * (inpHeight - 1) + y * heightScale;
int y0 = static_cast<int>(input_y);
const float* inpData_row0 = (float*)inp.data + y0 * inpWidth;
const float* inpData_row1 = (y0 + 1 < inpHeight) ? (inpData_row0 + inpWidth) : inpData_row0;
for (int x = 0; x < outWidth; ++x)
{
float input_x = left * (inpWidth - 1) + x * widthScale;
int x0 = static_cast<int>(input_x);
int x1 = std::min(x0 + 1, inpWidth - 1);

float* outData = outDataBox + y * outWidth + x;
const float* inpData_row0_c = inpData_row0;
const float* inpData_row1_c = inpData_row1;
for (int c = 0; c < numChannels; ++c)
{
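// Bilinear interpolation between the four input pixels around (input_y, input_x), repeated per channel.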
*outData = inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x0] - inpData_row0_c[x0]) +
(input_x - x0) * (inpData_row0_c[x1] - inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x1] - inpData_row0_c[x1] - inpData_row1_c[x0] + inpData_row0_c[x0]));

inpData_row0_c += inpSpatialSize;
inpData_row1_c += inpSpatialSize;
outData += outSpatialSize;
}
}
}
}
}

private:
int outWidth, outHeight;
};

Ptr<Layer> CropAndResizeLayer::create(const LayerParams& params)
{
return Ptr<CropAndResizeLayer>(new CropAndResizeLayerImpl(params));
}

} // namespace dnn
} // namespace cv
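For orientation, a minimal sketch of how the new layer could be exercised on its own (assumed wiring, not part of this commit): the first input is an NCHW feature map with N == 1, the second is a DetectionOutput-style blob with 7 floats per box ([imageId, classId, confidence, left, top, right, bottom], normalized coordinates), and the output holds one crop per box resized to height x width.

```cpp
#include <opencv2/dnn.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    LayerParams lp;
    lp.set("height", 7);
    lp.set("width", 7);

    Net net;
    net.setInputsNames({"data", "detections"});
    int id = net.addLayer("roi_crops", "CropAndResize", lp);
    net.connect(0, 0, id, 0);  // feature map, NCHW with N == 1
    net.connect(0, 1, id, 1);  // boxes, 7 floats per detection

    int dataShape[] = {1, 3, 32, 32};
    Mat data(4, dataShape, CV_32F, Scalar(1));

    // One box covering the central half of the image, normalized coordinates.
    float box[] = {0, 0, 1.0f, 0.25f, 0.25f, 0.75f, 0.75f};
    int boxShape[] = {1, 1, 1, 7};
    Mat detections(4, boxShape, CV_32F, box);

    net.setInput(data, "data");
    net.setInput(detections, "detections");
    Mat out = net.forward("roi_crops");  // 4D blob: 1 box x 3 channels x 7 x 7
    CV_Assert(out.size[2] == 7 && out.size[3] == 7);
    return 0;
}
```

In the converted Faster-RCNN graphs this wiring is produced automatically by the `CropAndResize` branch added to tf_importer.cpp below.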
5 changes: 3 additions & 2 deletions modules/dnn/src/layers/detection_output_layer.cpp
@@ -208,8 +208,9 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
CV_Assert(inputs[0][0] == inputs[1][0]);

int numPriors = inputs[2][2] / 4;
- CV_Assert((numPriors * _numLocClasses * 4) == inputs[0][1]);
- CV_Assert(int(numPriors * _numClasses) == inputs[1][1]);
+ CV_Assert((numPriors * _numLocClasses * 4) == total(inputs[0], 1));
+ CV_Assert(int(numPriors * _numClasses) == total(inputs[1], 1));
CV_Assert(inputs[2][1] == 1 + (int)(!_varianceEncodedInTarget));

// num() and channels() are 1.
// Since the number of bboxes to be kept is unknown before nms, we manually
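The relaxed checks presumably exist because the Faster-RCNN graphs feed DetectionOutput location and confidence blobs whose trailing axes differ from the SSD case; `total(shape, 1)` from shape_utils.hpp collapses every axis after the batch dimension, so both layouts pass the same assertion. A small illustration of that equivalence (not from this commit):

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    // A flat [N, numPriors * 4] blob and a padded [N, numPriors * 4, 1, 1] blob
    // carry the same number of location values per image.
    MatShape flat = {1, 100};          // e.g. 25 priors * 4 coordinates
    MatShape padded = {1, 100, 1, 1};
    CV_Assert(total(flat, 1) == total(padded, 1));  // both equal 100
    return 0;
}
```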
31 changes: 29 additions & 2 deletions modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1094,9 +1094,9 @@ void TFImporter::populateNet(Net dstNet)
CV_Assert(!begins.empty(), !sizes.empty(), begins.type() == CV_32SC1,
sizes.type() == CV_32SC1);

- if (begins.total() == 4)
+ if (begins.total() == 4 && data_layouts[name] == DATA_LAYOUT_NHWC)
{
- // Perhabs, we have an NHWC order. Swap it to NCHW.
+ // Swap NHWC parameters' order to NCHW.
std::swap(*begins.ptr<int32_t>(0, 2), *begins.ptr<int32_t>(0, 3));
std::swap(*begins.ptr<int32_t>(0, 1), *begins.ptr<int32_t>(0, 2));
std::swap(*sizes.ptr<int32_t>(0, 2), *sizes.ptr<int32_t>(0, 3));
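The two swaps reorder the per-axis Slice parameters from NHWC to NCHW. A tiny self-contained check of that index shuffle (illustrative only):

```cpp
#include <algorithm>
#include <cassert>

int main()
{
    // NHWC "begins" of a Slice node: [batch, y, x, channel]
    int begins[4] = {0, 10, 20, 3};
    // The same two swaps as above produce NCHW order: [batch, channel, y, x]
    std::swap(begins[2], begins[3]);
    std::swap(begins[1], begins[2]);
    assert(begins[0] == 0 && begins[1] == 3 && begins[2] == 10 && begins[3] == 20);
    return 0;
}
```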
@@ -1176,6 +1176,9 @@ void TFImporter::populateNet(Net dstNet)
layers_to_ignore.insert(next_layers[0].first);
}

if (hasLayerAttr(layer, "axis"))
layerParams.set("axis", getLayerAttr(layer, "axis").i());

id = dstNet.addLayer(name, "Scale", layerParams);
}
layer_id[name] = id;
@@ -1547,6 +1550,10 @@ void TFImporter::populateNet(Net dstNet)
layerParams.set("confidence_threshold", getLayerAttr(layer, "confidence_threshold").f());
if (hasLayerAttr(layer, "loc_pred_transposed"))
layerParams.set("loc_pred_transposed", getLayerAttr(layer, "loc_pred_transposed").b());
if (hasLayerAttr(layer, "clip"))
layerParams.set("clip", getLayerAttr(layer, "clip").b());
if (hasLayerAttr(layer, "variance_encoded_in_target"))
layerParams.set("variance_encoded_in_target", getLayerAttr(layer, "variance_encoded_in_target").b());

int id = dstNet.addLayer(name, "DetectionOutput", layerParams);
layer_id[name] = id;
@@ -1563,6 +1570,26 @@ void TFImporter::populateNet(Net dstNet)
layer_id[name] = id;
connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, layer.input_size());
}
else if (type == "CropAndResize")
{
// op: "CropAndResize"
// input: "input"
// input: "boxes"
// input: "sizes"
CV_Assert(layer.input_size() == 3);

Mat cropSize = getTensorContent(getConstBlob(layer, value_id, 2));
CV_Assert(cropSize.type() == CV_32SC1, cropSize.total() == 2);

layerParams.set("height", cropSize.at<int>(0));
layerParams.set("width", cropSize.at<int>(1));

int id = dstNet.addLayer(name, "CropAndResize", layerParams);
layer_id[name] = id;

connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
}
else if (type == "Mean")
{
Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
16 changes: 16 additions & 0 deletions modules/dnn/test/test_tf_importer.cpp
@@ -270,6 +270,22 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
normAssertDetections(ref, out, "", 0.5);
}

TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
{
std::string proto = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", false);
std::string model = findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false);

Net net = readNetFromTensorflow(model, proto);
Mat img = imread(findDataFile("dnn/dog416.png", false));
Mat blob = blobFromImage(img, 1.0f / 127.5, Size(800, 600), Scalar(127.5, 127.5, 127.5), true, false);

net.setInput(blob);
Mat out = net.forward();

Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/faster_rcnn_inception_v2_coco_2018_01_28.detection_out.npy"));
normAssertDetections(ref, out, "", 0.3);
}

TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
{
std::string proto = findDataFile("dnn/opencv_face_detector.pbtxt", false);
4 changes: 3 additions & 1 deletion samples/dnn/README.md
@@ -11,8 +11,10 @@
| [SSDs from TensorFlow](https://github.com/tensorflow/models/tree/master/research/object_detection/) | `0.00784 (2/255)` | `300x300` | `127.5 127.5 127.5` | RGB |
| [YOLO](https://pjreddie.com/darknet/yolo/) | `0.00392 (1/255)` | `416x416` | `0 0 0` | RGB |
| [VGG16-SSD](https://github.com/weiliu89/caffe/tree/ssd) | `1.0` | `300x300` | `104 117 123` | BGR |
- | [Faster-RCNN](https://github.com/rbgirshick/py-faster-rcnn) | `1.0` | `800x600` | `102.9801, 115.9465, 122.7717` | BGR |
+ | [Faster-RCNN](https://github.com/rbgirshick/py-faster-rcnn) | `1.0` | `800x600` | `102.9801 115.9465 122.7717` | BGR |
| [R-FCN](https://github.com/YuwenXiong/py-R-FCN) | `1.0` | `800x600` | `102.9801 115.9465 122.7717` | BGR |
| [Faster-RCNN, ResNet backbone](https://github.com/tensorflow/models/tree/master/research/object_detection/) | `1.0` | `300x300` | `103.939 116.779 123.68` | RGB |
| [Faster-RCNN, InceptionV2 backbone](https://github.com/tensorflow/models/tree/master/research/object_detection/) | `0.00784 (2/255)` | `300x300` | `127.5 127.5 127.5` | RGB |
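
Each row maps directly onto `blobFromImage` arguments; a hedged sketch using the InceptionV2 Faster-RCNN row above (the RGB column becomes `swapRB=true` because `imread` returns BGR, and the image path is a placeholder):

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    Mat img = imread("example.jpg");
    Mat blob = blobFromImage(img,
                             0.00784,                      // Scale (2/255)
                             Size(300, 300),               // Size (width x height)
                             Scalar(127.5, 127.5, 127.5),  // Mean, subtracted per channel
                             true,                         // RGB order -> swapRB = true
                             false);                       // no center crop
    return 0;
}
```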

#### Face detection
[An origin model](https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector)
