Merge pull request WZMIAOMIAO#39 from WZMIAOMIAO/dev

Dev
wangran95 · Jul 24, 2020 · 0636048
2 parents 1e7e3aa + 606a8d2
commit 0636048
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 38 deletions.
diff --git a/pytorch_object_detection/ssd/README.md b/pytorch_object_detection/ssd/README.md
@@ -39,7 +39,7 @@
 * 若要使用多GPU训练，使用 "python -m torch.distributed.launch --nproc_per_node=8 --use_env train_multi_GPU.py" 指令,nproc_per_node参数为使用GPU数量
 
 ## 如果对SSD算法原理不是很理解可参考我的bilibili
-* https://b23.tv/GJnkOD
+* https://www.bilibili.com/video/BV1fT4y1L7Gi
 
 ## 进一步了解该项目，以及对SSD算法代码的分析可参考我的bilibili
 * https://www.bilibili.com/video/BV1vK411H771/

diff --git a/pytorch_object_detection/ssd/src/ssd_model.py b/pytorch_object_detection/ssd/src/ssd_model.py
@@ -146,29 +146,28 @@ class Loss(nn.Module):
     """
     def __init__(self, dboxes):
         super(Loss, self).__init__()
-        self.scale_xy = 1.0 / dboxes.scale_xy
-        self.scale_wh = 1.0 / dboxes.scale_wh
+        # These two scale factors come from the following link
+        # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
+        self.scale_xy = 1.0 / dboxes.scale_xy  # 10
+        self.scale_wh = 1.0 / dboxes.scale_wh  # 5
 
         self.location_loss = nn.SmoothL1Loss(reduction='none')
-        # self.location_loss = nn.SmoothL1Loss(reduce=False)
+        # [num_anchors, 4] -> [4, num_anchors] -> [1, 4, num_anchors]
         self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0),
                                    requires_grad=False)
 
-        # Two factor are from following links
-        # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
         self.confidence_loss = nn.CrossEntropyLoss(reduction='none')
-        # self.confidence_loss = nn.CrossEntropyLoss(reduce=False)
 
     def _location_vec(self, loc):
         # type: (Tensor)
         """
         Generate Location Vectors
         计算ground truth相对anchors的回归参数
-        :param loc:
+        :param loc: anchor匹配到的对应GTBOX Nx4x8732
         :return:
         """
-        gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :]
-        gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()
+        gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :]  # Nx2x8732
+        gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()  # Nx2x8732
         return torch.cat((gxy, gwh), dim=1).contiguous()
 
     def forward(self, ploc, plabel, gloc, glabel):
@@ -217,8 +216,9 @@ def forward(self, ploc, plabel, gloc, glabel):
         # avoid no object detected
         # 避免出现图像中没有GTBOX的情况
         total_loss = loc_loss + con_loss
-        num_mask = (pos_num > 0).float()  # 统计一个batch中的每张图像中是否存在GTBOX
+        # e.g. pos_num [15, 3, 5, 0] -> num_mask [1.0, 1.0, 1.0, 0.0]
+        num_mask = (pos_num > 0).float()  # 统计一个batch中的每张图像中是否存在正样本
         pos_num = pos_num.float().clamp(min=1e-6)  # 防止出现分母为零的情况
-        ret = (total_loss * num_mask / pos_num).mean(dim=0)  # 只计算存在GTBOX的图像损失
+        ret = (total_loss * num_mask / pos_num).mean(dim=0)  # 只计算存在正样本的图像损失
         return ret
 
diff --git a/pytorch_object_detection/ssd/src/utils.py b/pytorch_object_detection/ssd/src/utils.py
@@ -339,18 +339,23 @@ def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200)
 
 class DefaultBoxes(object):
     def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, scale_xy=0.1, scale_wh=0.2):
-        self.fig_size = fig_size   # 输入网络的图像大小
+        self.fig_size = fig_size   # 输入网络的图像大小 300
+        # [38, 19, 10, 5, 3, 1]
         self.feat_size = feat_size  # 每个预测层的feature map尺寸
 
         self.scale_xy_ = scale_xy
         self.scale_wh_ = scale_wh
 
         # According to https://github.com/weiliu89/caffe
         # Calculation method slightly different from paper
+        # [8, 16, 32, 64, 100, 300]
         self.steps = steps    # 每个特征层上的一个cell在原图上的跨度
+
+        # [21, 45, 99, 153, 207, 261, 315]
         self.scales = scales  # 每个特征层上预测的default box的scale
 
         fk = fig_size / np.array(steps)     # 计算每层特征层的fk
+        # [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
         self.aspect_ratios = aspect_ratios  # 每个预测特征层上预测的default box的ratios
 
         self.default_boxes = []
@@ -376,17 +381,18 @@ def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, scale_xy=0
                     cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
                     self.default_boxes.append((cx, cy, w, h))
 
-            self.dboxes = torch.tensor(self.default_boxes, dtype=torch.float32)  # 这里不转类型会报错
-            self.dboxes.clamp_(min=0, max=1)  # 将坐标（x, y, w, h）都限制在0-1之间
+        # 将default_boxes转为tensor格式
+        self.dboxes = torch.tensor(self.default_boxes, dtype=torch.float32)  # 这里不转类型会报错
+        self.dboxes.clamp_(min=0, max=1)  # 将坐标（x, y, w, h）都限制在0-1之间
 
-            # For IoU calculation
-            # ltrb is left top coordinate and right bottom coordinate
-            # 将(x, y, w, h)转换成(xmin, ymin, xmax, ymax)，方便后续计算IoU(匹配正负样本时)
-            self.dboxes_ltrb = self.dboxes.clone()
-            self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]
-            self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]
-            self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]
-            self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]
+        # For IoU calculation
+        # ltrb is left top coordinate and right bottom coordinate
+        # 将(x, y, w, h)转换成(xmin, ymin, xmax, ymax)，方便后续计算IoU(匹配正负样本时)
+        self.dboxes_ltrb = self.dboxes.clone()
+        self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]   # xmin
+        self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]   # ymin
+        self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]   # xmax
+        self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]   # ymax
 
     @property
     def scale_xy(self):
@@ -498,27 +504,32 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
 class PostProcess(nn.Module):
     def __init__(self, dboxes):
         super(PostProcess, self).__init__()
+        # [num_anchors, 4] -> [1, num_anchors, 4]
         self.dboxes_xywh = nn.Parameter(dboxes(order='xywh').unsqueeze(dim=0),
                                         requires_grad=False)
-        self.scale_xy = dboxes.scale_xy
-        self.scale_wh = dboxes.scale_wh
+        self.scale_xy = dboxes.scale_xy  # 0.1
+        self.scale_wh = dboxes.scale_wh  # 0.2
 
         self.criteria = 0.5
         self.max_output = 100
 
     def scale_back_batch(self, bboxes_in, scores_in):
         # type: (Tensor, Tensor)
         """
-            将box格式从xywh转换回ltrb, 将预测目标score通过softmax处理
+            1）通过预测的boxes回归参数得到最终预测坐标
+            2）将box格式从xywh转换回ltrb
+            3）将预测目标score通过softmax处理
             Do scale and transform from xywh to ltrb
             suppose input N x 4 x num_bbox | N x label_num x num_bbox
 
-            bboxes_in: 是网络预测的xywh回归参数
-            scores_in: 是预测的每个default box的各目标概率
+            bboxes_in: [N, 4, 8732]是网络预测的xywh回归参数
+            scores_in: [N, label_num, 8732]是预测的每个default box的各目标概率
         """
 
         # Returns a view of the original tensor with its dimensions permuted.
+        # [batch, 4, 8732] -> [batch, 8732, 4]
         bboxes_in = bboxes_in.permute(0, 2, 1)
+        # [batch, label_num, 8732] -> [batch, 8732, label_num]
         scores_in = scores_in.permute(0, 2, 1)
         # print(bboxes_in.is_contiguous())
 
@@ -540,6 +551,7 @@ def scale_back_batch(self, bboxes_in, scores_in):
         bboxes_in[:, :, 2] = r  # xmax
         bboxes_in[:, :, 3] = b  # ymax
 
+        # scores_in: [batch, 8732, label_num]
         return bboxes_in, F.softmax(scores_in, dim=-1)
 
     def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
@@ -562,27 +574,28 @@ def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
 
         # create labels for each prediction
         labels = torch.arange(num_classes, device=device)
+        # [num_classes] -> [8732, num_classes]
         labels = labels.view(1, -1).expand_as(scores_in)
 
         # remove prediction with the background label
         # 移除归为背景类别的概率信息
-        bboxes_in = bboxes_in[:, 1:, :]
-        scores_in = scores_in[:, 1:]
-        labels = labels[:, 1:]
+        bboxes_in = bboxes_in[:, 1:, :]  # [8732, 21, 4] -> [8732, 20, 4]
+        scores_in = scores_in[:, 1:]  # [8732, 21] -> [8732, 20]
+        labels = labels[:, 1:]  # [8732, 21] -> [8732, 20]
 
         # batch everything, by making every class prediction be a separate instance
-        bboxes_in = bboxes_in.reshape(-1, 4)
-        scores_in = scores_in.reshape(-1)
-        labels = labels.reshape(-1)
+        bboxes_in = bboxes_in.reshape(-1, 4)  # [8732, 20, 4] -> [8732x20, 4]
+        scores_in = scores_in.reshape(-1)  # [8732, 20] -> [8732x20]
+        labels = labels.reshape(-1)  # [8732, 20] -> [8732x20]
 
         # remove low scoring boxes
         # 移除低概率目标，self.scores_thresh=0.05
         inds = torch.nonzero(scores_in > 0.05).squeeze(1)
-        bboxes_in, scores_in, labels = bboxes_in[inds], scores_in[inds], labels[inds]
+        bboxes_in, scores_in, labels = bboxes_in[inds, :], scores_in[inds], labels[inds]
 
         # remove empty boxes
         ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
-        keep = (ws >= 0.1 / 300) & (hs >= 0.1 / 300)
+        keep = (ws >= 1 / 300) & (hs >= 1 / 300)
         keep = keep.nonzero().squeeze(1)
         bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]
 
@@ -598,12 +611,14 @@ def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
         return bboxes_out, labels_out, scores_out
 
     def forward(self, bboxes_in, scores_in):
-        # 将box格式从xywh转换回ltrb（方便后面非极大值抑制时求iou）, 将预测目标score通过softmax处理
+        # 通过预测的boxes回归参数得到最终预测坐标, 将预测目标score通过softmax处理
         bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)
 
         outputs = torch.jit.annotate(List[Tuple[Tensor, Tensor, Tensor]], [])
         # 遍历一个batch中的每张image数据
-        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
+        # bboxes: [batch, 8732, 4]
+        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):  # split_size, split_dim
+            # bbox: [1, 8732, 4]
             bbox = bbox.squeeze(0)
             prob = prob.squeeze(0)
             outputs.append(self.decode_single_new(bbox, prob, self.criteria, self.max_output))