Update for yolov4-full

yalinw00 · Jul 23, 2020 · ecd8231 · ecd8231
1 parent 087052f
commit ecd8231
Showing 1 changed file with 183 additions and 75 deletions.
diff --git a/...oid/app/src/main/java/org/tensorflow/lite/examples/detection/tflite/YoloV4Classifier.java b/...oid/app/src/main/java/org/tensorflow/lite/examples/detection/tflite/YoloV4Classifier.java
@@ -1,11 +1,8 @@
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -181,10 +178,11 @@ public float getObjThresh() {
     private static boolean isGPU = true;
 
     // tiny or not
-    private static boolean isTiny = true;
+    private static boolean isTiny = false;
 
     // config yolov4 tiny
     private static final int[] OUTPUT_WIDTH_TINY = new int[]{2535, 2535};
+    private static final int[] OUTPUT_WIDTH_FULL = new int[]{10647, 10647};
     private static final int[][] MASKS_TINY = new int[][]{{3, 4, 5}, {1, 2, 3}};
     private static final int[] ANCHORS_TINY = new int[]{
             23, 27, 37, 58, 81, 82, 81, 82, 135, 169, 344, 319};
@@ -304,84 +302,127 @@ protected ByteBuffer convertBitmapToByteBuffer(Bitmap bitmap) {
         return byteBuffer;
     }
 
-    private ArrayList<Recognition> getDetections(ByteBuffer byteBuffer, Bitmap bitmap) {
-        ArrayList<Recognition> detections = new ArrayList<Recognition>();
-        Map<Integer, Object> outputMap = new HashMap<>();
-        for (int i = 0; i < OUTPUT_WIDTH.length; i++) {
-            float[][][][][] out = new float[1][OUTPUT_WIDTH[i]][OUTPUT_WIDTH[i]][3][5 + labels.size()];
-            outputMap.put(i, out);
-        }
+//    private ArrayList<Recognition> getDetections(ByteBuffer byteBuffer, Bitmap bitmap) {
+//        ArrayList<Recognition> detections = new ArrayList<Recognition>();
+//        Map<Integer, Object> outputMap = new HashMap<>();
+//        for (int i = 0; i < OUTPUT_WIDTH.length; i++) {
+//            float[][][][][] out = new float[1][OUTPUT_WIDTH[i]][OUTPUT_WIDTH[i]][3][5 + labels.size()];
+//            outputMap.put(i, out);
+//        }
+//
+//        Log.d("YoloV4Classifier", "mObjThresh: " + getObjThresh());
+//
+//        Object[] inputArray = {byteBuffer};
+//        tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+//
+//        for (int i = 0; i < OUTPUT_WIDTH.length; i++) {
+//            int gridWidth = OUTPUT_WIDTH[i];
+//            float[][][][][] out = (float[][][][][]) outputMap.get(i);
+//
+//            Log.d("YoloV4Classifier", "out[" + i + "] detect start");
+//            for (int y = 0; y < gridWidth; ++y) {
+//                for (int x = 0; x < gridWidth; ++x) {
+//                    for (int b = 0; b < NUM_BOXES_PER_BLOCK; ++b) {
+//                        final int offset =
+//                                (gridWidth * (NUM_BOXES_PER_BLOCK * (labels.size() + 5))) * y
+//                                        + (NUM_BOXES_PER_BLOCK * (labels.size() + 5)) * x
+//                                        + (labels.size() + 5) * b;
+//
+//                        final float confidence = expit(out[0][y][x][b][4]);
+//                        int detectedClass = -1;
+//                        float maxClass = 0;
+//
+//                        final float[] classes = new float[labels.size()];
+//                        for (int c = 0; c < labels.size(); ++c) {
+//                            classes[c] = out[0][y][x][b][5 + c];
+//                        }
+//
+//                        for (int c = 0; c < labels.size(); ++c) {
+//                            if (classes[c] > maxClass) {
+//                                detectedClass = c;
+//                                maxClass = classes[c];
+//                            }
+//                        }
+//
+//                        final float confidenceInClass = maxClass * confidence;
+//                        if (confidenceInClass > getObjThresh()) {
+////                            final float xPos = (x + (expit(out[0][y][x][b][0]) * XYSCALE[i]) - (0.5f * (XYSCALE[i] - 1))) * (INPUT_SIZE / gridWidth);
+////                            final float yPos = (y + (expit(out[0][y][x][b][1]) * XYSCALE[i]) - (0.5f * (XYSCALE[i] - 1))) * (INPUT_SIZE / gridWidth);
+//
+//                            final float xPos = (x + expit(out[0][y][x][b][0])) * (1.0f * INPUT_SIZE / gridWidth);
+//                            final float yPos = (y + expit(out[0][y][x][b][1])) * (1.0f * INPUT_SIZE / gridWidth);
+//
+//                            final float w = (float) (Math.exp(out[0][y][x][b][2]) * ANCHORS[2 * MASKS[i][b]]);
+//                            final float h = (float) (Math.exp(out[0][y][x][b][3]) * ANCHORS[2 * MASKS[i][b] + 1]);
+//
+//                            final RectF rect =
+//                                    new RectF(
+//                                            Math.max(0, xPos - w / 2),
+//                                            Math.max(0, yPos - h / 2),
+//                                            Math.min(bitmap.getWidth() - 1, xPos + w / 2),
+//                                            Math.min(bitmap.getHeight() - 1, yPos + h / 2));
+//                            detections.add(new Recognition("" + offset, labels.get(detectedClass),
+//                                    confidenceInClass, rect, detectedClass));
+//                        }
+//                    }
+//                }
+//            }
+//            Log.d("YoloV4Classifier", "out[" + i + "] detect end");
+//        }
+//        return detections;
+//    }
 
-        Log.d("YoloV4Classifier", "mObjThresh: " + getObjThresh());
+    /**
+     * For the full yolov4 model the output layout is the same as for yolov4-tiny (which uses 2535 rows): it has two
+     * outputs, both three-dimensional. The first one is a tensor with dimensions [1, 10647, 4], containing all the bounding boxes.
+     * The second one is a tensor with dimensions [1, 10647, class_num], containing all the class scores.
+     * @param byteBuffer input ByteBuffer, which contains the image information
+     * @param bitmap source bitmap; its width and height are used to clamp the box coordinates
+     * @return an array list containing the recognitions
+     */
 
+    private ArrayList<Recognition> getDetectionsForFull(ByteBuffer byteBuffer, Bitmap bitmap) {
+        ArrayList<Recognition> detections = new ArrayList<Recognition>();
+        Map<Integer, Object> outputMap = new HashMap<>();
+        outputMap.put(0, new float[1][OUTPUT_WIDTH_FULL[0]][4]);
+        outputMap.put(1, new float[1][OUTPUT_WIDTH_FULL[1]][labels.size()]);
         Object[] inputArray = {byteBuffer};
         tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
 
-        for (int i = 0; i < OUTPUT_WIDTH.length; i++) {
-            int gridWidth = OUTPUT_WIDTH[i];
-            float[][][][][] out = (float[][][][][]) outputMap.get(i);
-
-            Log.d("YoloV4Classifier", "out[" + i + "] detect start");
-            for (int y = 0; y < gridWidth; ++y) {
-                for (int x = 0; x < gridWidth; ++x) {
-                    for (int b = 0; b < NUM_BOXES_PER_BLOCK; ++b) {
-                        final int offset =
-                                (gridWidth * (NUM_BOXES_PER_BLOCK * (labels.size() + 5))) * y
-                                        + (NUM_BOXES_PER_BLOCK * (labels.size() + 5)) * x
-                                        + (labels.size() + 5) * b;
-
-                        final float confidence = expit(out[0][y][x][b][4]);
-                        int detectedClass = -1;
-                        float maxClass = 0;
-
-                        final float[] classes = new float[labels.size()];
-                        for (int c = 0; c < labels.size(); ++c) {
-                            classes[c] = out[0][y][x][b][5 + c];
-                        }
-
-                        for (int c = 0; c < labels.size(); ++c) {
-                            if (classes[c] > maxClass) {
-                                detectedClass = c;
-                                maxClass = classes[c];
-                            }
-                        }
-
-                        final float confidenceInClass = maxClass * confidence;
-                        if (confidenceInClass > getObjThresh()) {
-//                            final float xPos = (x + (expit(out[0][y][x][b][0]) * XYSCALE[i]) - (0.5f * (XYSCALE[i] - 1))) * (INPUT_SIZE / gridWidth);
-//                            final float yPos = (y + (expit(out[0][y][x][b][1]) * XYSCALE[i]) - (0.5f * (XYSCALE[i] - 1))) * (INPUT_SIZE / gridWidth);
-
-                            final float xPos = (x + expit(out[0][y][x][b][0])) * (1.0f * INPUT_SIZE / gridWidth);
-                            final float yPos = (y + expit(out[0][y][x][b][1])) * (1.0f * INPUT_SIZE / gridWidth);
-
-                            final float w = (float) (Math.exp(out[0][y][x][b][2]) * ANCHORS[2 * MASKS[i][b]]);
-                            final float h = (float) (Math.exp(out[0][y][x][b][3]) * ANCHORS[2 * MASKS[i][b] + 1]);
-
-                            final RectF rect =
-                                    new RectF(
-                                            Math.max(0, xPos - w / 2),
-                                            Math.max(0, yPos - h / 2),
-                                            Math.min(bitmap.getWidth() - 1, xPos + w / 2),
-                                            Math.min(bitmap.getHeight() - 1, yPos + h / 2));
-                            detections.add(new Recognition("" + offset, labels.get(detectedClass),
-                                    confidenceInClass, rect, detectedClass));
-                        }
-                    }
+        int gridWidth = OUTPUT_WIDTH_FULL[0];
+        float[][][] bboxes = (float [][][]) outputMap.get(0);
+        float[][][] out_score = (float[][][]) outputMap.get(1);
+
+        for (int i = 0; i < gridWidth;i++){
+            float maxClass = 0;
+            int detectedClass = -1;
+            final float[] classes = new float[labels.size()];
+            for (int c = 0;c< labels.size();c++){
+                classes [c] = out_score[0][i][c];
+            }
+            for (int c = 0;c<labels.size();++c){
+                if (classes[c] > maxClass){
+                    detectedClass = c;
+                    maxClass = classes[c];
                 }
             }
-            Log.d("YoloV4Classifier", "out[" + i + "] detect end");
+            final float score = maxClass;
+            if (score > getObjThresh()){
+                final float xPos = bboxes[0][i][0];
+                final float yPos = bboxes[0][i][1];
+                final float w = bboxes[0][i][2];
+                final float h = bboxes[0][i][3];
+                final RectF rectF = new RectF(
+                        Math.max(0, xPos - w / 2),
+                        Math.max(0, yPos - h / 2),
+                        Math.min(bitmap.getWidth() - 1, xPos + w / 2),
+                        Math.min(bitmap.getHeight() - 1, yPos + h / 2));
+                detections.add(new Recognition("" + i, labels.get(detectedClass),score,rectF,detectedClass ));
+            }
         }
         return detections;
     }
 
-    /**
-     * For yolov4-tiny, the situation would be a little different from the yolov4, it only has two
-     * output. Both has three dimenstion. The first one is a tensor with dimension [1, 2535,4], containing all the bounding boxes.
-     * The second one is a tensor with dimension [1, 2535, class_num], containing all the classes score.
-     * @param byteBuffer input ByteBuffer, which contains the image information
-     * @param bitmap pixel disenty used to resize the output images
-     * @return an array list containing the recognitions
-     */
     private ArrayList<Recognition> getDetectionsForTiny(ByteBuffer byteBuffer, Bitmap bitmap) {
         ArrayList<Recognition> detections = new ArrayList<Recognition>();
         Map<Integer, Object> outputMap = new HashMap<>();
@@ -418,20 +459,87 @@ private ArrayList<Recognition> getDetectionsForTiny(ByteBuffer byteBuffer, Bitma
                         Math.max(0, yPos - h / 2),
                         Math.min(bitmap.getWidth() - 1, xPos + w / 2),
                         Math.min(bitmap.getHeight() - 1, yPos + h / 2));
-            detections.add(new Recognition("" + i, labels.get(detectedClass),score,rectF,detectedClass ));
+                detections.add(new Recognition("" + i, labels.get(detectedClass),score,rectF,detectedClass ));
             }
         }
         return detections;
     }
 
     public ArrayList<Recognition> recognizeImage(Bitmap bitmap) {
         ByteBuffer byteBuffer = convertBitmapToByteBuffer(bitmap);
+
+//        Map<Integer, Object> outputMap = new HashMap<>();
+//        for (int i = 0; i < OUTPUT_WIDTH.length; i++) {
+//            float[][][][][] out = new float[1][OUTPUT_WIDTH[i]][OUTPUT_WIDTH[i]][3][5 + labels.size()];
+//            outputMap.put(i, out);
+//        }
+//
+//        Log.d("YoloV4Classifier", "mObjThresh: " + getObjThresh());
+//
+//        Object[] inputArray = {byteBuffer};
+//        tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+//
+//        ArrayList<Recognition> detections = new ArrayList<Recognition>();
+//
+//        for (int i = 0; i < OUTPUT_WIDTH.length; i++) {
+//            int gridWidth = OUTPUT_WIDTH[i];
+//            float[][][][][] out = (float[][][][][]) outputMap.get(i);
+//
+//            Log.d("YoloV4Classifier", "out[" + i + "] detect start");
+//            for (int y = 0; y < gridWidth; ++y) {
+//                for (int x = 0; x < gridWidth; ++x) {
+//                    for (int b = 0; b < NUM_BOXES_PER_BLOCK; ++b) {
+//                        final int offset =
+//                                (gridWidth * (NUM_BOXES_PER_BLOCK * (labels.size() + 5))) * y
+//                                        + (NUM_BOXES_PER_BLOCK * (labels.size() + 5)) * x
+//                                        + (labels.size() + 5) * b;
+//
+//                        final float confidence = expit(out[0][y][x][b][4]);
+//                        int detectedClass = -1;
+//                        float maxClass = 0;
+//
+//                        final float[] classes = new float[labels.size()];
+//                        for (int c = 0; c < labels.size(); ++c) {
+//                            classes[c] = out[0][y][x][b][5 + c];
+//                        }
+//
+//                        for (int c = 0; c < labels.size(); ++c) {
+//                            if (classes[c] > maxClass) {
+//                                detectedClass = c;
+//                                maxClass = classes[c];
+//                            }
+//                        }
+//
+//                        final float confidenceInClass = maxClass * confidence;
+//                        if (confidenceInClass > getObjThresh()) {
+////                            final float xPos = (x + (expit(out[0][y][x][b][0]) * XYSCALE[i]) - (0.5f * (XYSCALE[i] - 1))) * (INPUT_SIZE / gridWidth);
+////                            final float yPos = (y + (expit(out[0][y][x][b][1]) * XYSCALE[i]) - (0.5f * (XYSCALE[i] - 1))) * (INPUT_SIZE / gridWidth);
+//
+//                            final float xPos = (x + expit(out[0][y][x][b][0])) * (1.0f * INPUT_SIZE / gridWidth);
+//                            final float yPos = (y + expit(out[0][y][x][b][1])) * (1.0f * INPUT_SIZE / gridWidth);
+//
+//                            final float w = (float) (Math.exp(out[0][y][x][b][2]) * ANCHORS[2 * MASKS[i][b]]);
+//                            final float h = (float) (Math.exp(out[0][y][x][b][3]) * ANCHORS[2 * MASKS[i][b] + 1]);
+//
+//                            final RectF rect =
+//                                    new RectF(
+//                                            Math.max(0, xPos - w / 2),
+//                                            Math.max(0, yPos - h / 2),
+//                                            Math.min(bitmap.getWidth() - 1, xPos + w / 2),
+//                                            Math.min(bitmap.getHeight() - 1, yPos + h / 2));
+//                            detections.add(new Recognition("" + offset, labels.get(detectedClass),
+//                                    confidenceInClass, rect, detectedClass));
+//                        }
+//                    }
+//                }
+//            }
+//            Log.d("YoloV4Classifier", "out[" + i + "] detect end");
+//        }
         ArrayList<Recognition> detections;
-        //check whether the tiny version is specified
         if (isTiny) {
             detections = getDetectionsForTiny(byteBuffer, bitmap);
         } else {
-            detections = getDetections(byteBuffer, bitmap);
+            detections = getDetectionsForFull(byteBuffer, bitmap);
         }
         final ArrayList<Recognition> recognitions = nms(detections);
         return recognitions;
@@ -488,4 +596,4 @@ public boolean checkInvalidateBox(float x, float y, float width, float height, f
 
         return true;
     }
-}
+}