{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 9.4 锚框" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.2.0\n" ] } ], "source": [ "%matplotlib inline\n", "from PIL import Image\n", "import numpy as np\n", "import math\n", "import torch\n", "\n", "import sys\n", "sys.path.append(\"..\") \n", "import d2lzh_pytorch as d2l\n", "print(torch.__version__)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9.4.1 生成多个锚框" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "w = 728, h = 561\n" ] } ], "source": [ "d2l.set_figsize()\n", "img = Image.open('../../img/catdog.jpg')\n", "w, h = img.size\n", "print(\"w = %d, h = %d\" % (w, h))\n", "\n", "# d2l.plt.imshow(img); # 加分号只显示图" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# 本函数已保存在d2lzh_pytorch包中方便以后使用\n", "def MultiBoxPrior(feature_map, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5]):\n", " \"\"\"\n", " # 按照「9.4.1. 生成多个锚框」所讲的实现, anchor表示成(xmin, ymin, xmax, ymax).\n", " https://zh.d2l.ai/chapter_computer-vision/anchor.html\n", " Args:\n", " feature_map: torch tensor, Shape: [N, C, H, W].\n", " sizes: List of sizes (0~1) of generated MultiBoxPriores. \n", " ratios: List of aspect ratios (non-negative) of generated MultiBoxPriores. \n", " Returns:\n", " anchors of shape (1, num_anchors, 4). 由于batch里每个都一样, 所以第一维为1\n", " \"\"\"\n", " pairs = [] # pair of (size, sqrt(ration))\n", " for r in ratios:\n", " pairs.append([sizes[0], math.sqrt(r)])\n", " for s in sizes[1:]:\n", " pairs.append([s, math.sqrt(ratios[0])])\n", " \n", " pairs = np.array(pairs)\n", " \n", " ss1 = pairs[:, 0] * pairs[:, 1] # size * sqrt(ration)\n", " ss2 = pairs[:, 0] / pairs[:, 1] # size / sqrt(ration)\n", " \n", " base_anchors = np.stack([-ss1, -ss2, ss1, ss2], axis=1) / 2\n", " \n", " h, w = feature_map.shape[-2:]\n", " shifts_x = np.arange(0, w) / w\n", " shifts_y = np.arange(0, h) / h\n", " shift_x, shift_y = np.meshgrid(shifts_x, shifts_y)\n", " shift_x = shift_x.reshape(-1)\n", " shift_y = shift_y.reshape(-1)\n", " shifts = np.stack((shift_x, shift_y, shift_x, shift_y), axis=1)\n", " \n", " anchors = shifts.reshape((-1, 1, 4)) + base_anchors.reshape((1, -1, 4))\n", " \n", " return torch.tensor(anchors, dtype=torch.float32).view(1, -1, 4)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 2042040, 4])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = torch.Tensor(1, 3, h, w) # 构造输入数据\n", "Y = MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])\n", "Y.shape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([-0.0316, 0.0706, 0.7184, 0.8206])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "boxes = Y.reshape((h, w, 5, 4))\n", "boxes[250, 250, 0, :]# * torch.tensor([w, h, w, h], dtype=torch.float32)\n", "# 第一个size和ratio分别为0.75和1, 则宽高均为0.75 = 0.7184 + 0.0316 = 0.8206 - 0.0706" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# 本函数已保存在dd2lzh_pytorch包中方便以后使用\n", "def show_bboxes(axes, bboxes, labels=None, colors=None):\n", " def _make_list(obj, default_values=None):\n", " if obj is None:\n", " obj = default_values\n", " elif not isinstance(obj, (list, tuple)):\n", " obj = [obj]\n", " return obj\n", "\n", " labels = _make_list(labels)\n", " colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])\n", " for i, bbox in enumerate(bboxes):\n", " color = colors[i % len(colors)]\n", " rect = d2l.bbox_to_rect(bbox.detach().cpu().numpy(), color)\n", " axes.add_patch(rect)\n", " if labels and len(labels) > i:\n", " text_color = 'k' if color == 'w' else 'w'\n", " axes.text(rect.xy[0], rect.xy[1], labels[i],\n", " va='center', ha='center', fontsize=6, color=text_color,\n", " bbox=dict(facecolor=color, lw=0))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "d2l.set_figsize()\n", "fig = d2l.plt.imshow(img)\n", "bbox_scale = torch.tensor([[w, h, w, h]], dtype=torch.float32)\n", "show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,\n", " ['s=0.75, r=1', 's=0.75, r=2', 's=0.55, r=0.5', 's=0.5, r=1', 's=0.25, r=1'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9.4.2 交并比\n", "代码来自https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py#L356" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# 以下函数已保存在d2lzh_pytorch包中方便以后使用\n", "def compute_intersection(set_1, set_2):\n", " \"\"\"\n", " 计算anchor之间的交集\n", " Args:\n", " set_1: a tensor of dimensions (n1, 4), anchor表示成(xmin, ymin, xmax, ymax)\n", " set_2: a tensor of dimensions (n2, 4), anchor表示成(xmin, ymin, xmax, ymax)\n", " Returns:\n", " intersection of each of the boxes in set 1 with respect to each of the boxes in set 2, shape: (n1, n2)\n", " \"\"\"\n", " # PyTorch auto-broadcasts singleton dimensions\n", " lower_bounds = torch.max(set_1[:, :2].unsqueeze(1), set_2[:, :2].unsqueeze(0)) # (n1, n2, 2)\n", " upper_bounds = torch.min(set_1[:, 2:].unsqueeze(1), set_2[:, 2:].unsqueeze(0)) # (n1, n2, 2)\n", " intersection_dims = torch.clamp(upper_bounds - lower_bounds, min=0) # (n1, n2, 2)\n", " return intersection_dims[:, :, 0] * intersection_dims[:, :, 1] # (n1, n2)\n", "\n", "\n", "def compute_jaccard(set_1, set_2):\n", " \"\"\"\n", " 计算anchor之间的Jaccard系数(IoU)\n", " Args:\n", " set_1: a tensor of dimensions (n1, 4), anchor表示成(xmin, ymin, xmax, ymax)\n", " set_2: a tensor of dimensions (n2, 4), anchor表示成(xmin, ymin, xmax, ymax)\n", " Returns:\n", " Jaccard Overlap of each of the boxes in set 1 with respect to each of the boxes in set 2, shape: (n1, n2)\n", " \"\"\"\n", " # Find intersections\n", " intersection = compute_intersection(set_1, set_2) # (n1, n2)\n", "\n", " # Find areas of each box in both sets\n", " areas_set_1 = (set_1[:, 2] - set_1[:, 0]) * (set_1[:, 3] - set_1[:, 1]) # (n1)\n", " areas_set_2 = (set_2[:, 2] - set_2[:, 0]) * (set_2[:, 3] - set_2[:, 1]) # (n2)\n", "\n", " # Find the union\n", " # PyTorch auto-broadcasts singleton dimensions\n", " union = areas_set_1.unsqueeze(1) + areas_set_2.unsqueeze(0) - intersection # (n1, n2)\n", "\n", " return intersection / union # (n1, n2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9.4.3 标注训练集的锚框" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bbox_scale = torch.tensor((w, h, w, h), dtype=torch.float32)\n", "ground_truth = torch.tensor([[0, 0.1, 0.08, 0.52, 0.92],\n", " [1, 0.55, 0.2, 0.9, 0.88]])\n", "anchors = torch.tensor([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],\n", " [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],\n", " [0.57, 0.3, 0.92, 0.9]])\n", "\n", "fig = d2l.plt.imshow(img)\n", "show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')\n", "show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4']);" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[0.0536, 0.0000],\n", " [0.1417, 0.0000],\n", " [0.0000, 0.5657],\n", " [0.0000, 0.2059],\n", " [0.0000, 0.7459]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compute_jaccard(anchors, ground_truth[:, 1:]) # 验证一下写的compute_jaccard函数" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# 以下函数已保存在d2lzh_pytorch包中方便以后使用\n", "def assign_anchor(bb, anchor, jaccard_threshold=0.5):\n", " \"\"\"\n", " # 按照「9.4.1. 生成多个锚框」图9.3所讲为每个anchor分配真实的bb, anchor表示成归一化(xmin, ymin, xmax, ymax).\n", " https://zh.d2l.ai/chapter_computer-vision/anchor.html\n", " Args:\n", " bb: 真实边界框(bounding box), shape:(nb, 4)\n", " anchor: 待分配的anchor, shape:(na, 4)\n", " jaccard_threshold: 预先设定的阈值\n", " Returns:\n", " assigned_idx: shape: (na, ), 每个anchor分配的真实bb对应的索引, 若未分配任何bb则为-1\n", " \"\"\"\n", " na = anchor.shape[0]\n", " nb = bb.shape[0]\n", " jaccard = compute_jaccard(anchor, bb).detach().cpu().numpy() # shape: (na, nb)\n", " assigned_idx = np.ones(na) * -1 # 初始全为-1\n", " \n", " # 先为每个bb分配一个anchor(不要求满足jaccard_threshold)\n", " jaccard_cp = jaccard.copy()\n", " for j in range(nb):\n", " i = np.argmax(jaccard_cp[:, j])\n", " assigned_idx[i] = j\n", " jaccard_cp[i, :] = float(\"-inf\") # 赋值为负无穷, 相当于去掉这一行\n", " \n", " # 处理还未被分配的anchor, 要求满足jaccard_threshold\n", " for i in range(na):\n", " if assigned_idx[i] == -1:\n", " j = np.argmax(jaccard[i, :])\n", " if jaccard[i, j] >= jaccard_threshold:\n", " assigned_idx[i] = j\n", " \n", " return torch.tensor(assigned_idx, dtype=torch.long)\n", "\n", "\n", "def xy_to_cxcy(xy):\n", " \"\"\"\n", " 将(x_min, y_min, x_max, y_max)形式的anchor转换成(center_x, center_y, w, h)形式的.\n", " https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py\n", " Args:\n", " xy: bounding boxes in boundary coordinates, a tensor of size (n_boxes, 4)\n", " Returns: \n", " bounding boxes in center-size coordinates, a tensor of size (n_boxes, 4)\n", " \"\"\"\n", " return torch.cat([(xy[:, 2:] + xy[:, :2]) / 2, # c_x, c_y\n", " xy[:, 2:] - xy[:, :2]], 1) # w, h\n", "\n", "def MultiBoxTarget(anchor, label):\n", " \"\"\"\n", " # 按照「9.4.1. 生成多个锚框」所讲的实现, anchor表示成归一化(xmin, ymin, xmax, ymax).\n", " https://zh.d2l.ai/chapter_computer-vision/anchor.html\n", " Args:\n", " anchor: torch tensor, 输入的锚框, 一般是通过MultiBoxPrior生成, shape:(1,锚框总数,4)\n", " label: 真实标签, shape为(bn, 每张图片最多的真实锚框数, 5)\n", " 第二维中,如果给定图片没有这么多锚框, 可以先用-1填充空白, 最后一维中的元素为[类别标签, 四个坐标值]\n", " Returns:\n", " 列表, [bbox_offset, bbox_mask, cls_labels]\n", " bbox_offset: 每个锚框的标注偏移量,形状为(bn,锚框总数*4)\n", " bbox_mask: 形状同bbox_offset, 每个锚框的掩码, 一一对应上面的偏移量, 负类锚框(背景)对应的掩码均为0, 正类锚框的掩码均为1\n", " cls_labels: 每个锚框的标注类别, 其中0表示为背景, 形状为(bn,锚框总数)\n", " \"\"\"\n", " assert len(anchor.shape) == 3 and len(label.shape) == 3\n", " bn = label.shape[0]\n", " \n", " def MultiBoxTarget_one(anc, lab, eps=1e-6):\n", " \"\"\"\n", " MultiBoxTarget函数的辅助函数, 处理batch中的一个\n", " Args:\n", " anc: shape of (锚框总数, 4)\n", " lab: shape of (真实锚框数, 5), 5代表[类别标签, 四个坐标值]\n", " eps: 一个极小值, 防止log0\n", " Returns:\n", " offset: (锚框总数*4, )\n", " bbox_mask: (锚框总数*4, ), 0代表背景, 1代表非背景\n", " cls_labels: (锚框总数, 4), 0代表背景\n", " \"\"\"\n", " an = anc.shape[0]\n", " assigned_idx = assign_anchor(lab[:, 1:], anc) # (锚框总数, )\n", " bbox_mask = ((assigned_idx >= 0).float().unsqueeze(-1)).repeat(1, 4) # (锚框总数, 4)\n", "\n", " cls_labels = torch.zeros(an, dtype=torch.long) # 0表示背景\n", " assigned_bb = torch.zeros((an, 4), dtype=torch.float32) # 所有anchor对应的bb坐标\n", " for i in range(an):\n", " bb_idx = assigned_idx[i]\n", " if bb_idx >= 0: # 即非背景\n", " cls_labels[i] = lab[bb_idx, 0].long().item() + 1 # 注意要加一\n", " assigned_bb[i, :] = lab[bb_idx, 1:]\n", "\n", " center_anc = xy_to_cxcy(anc) # (center_x, center_y, w, h)\n", " center_assigned_bb = xy_to_cxcy(assigned_bb)\n", "\n", " offset_xy = 10.0 * (center_assigned_bb[:, :2] - center_anc[:, :2]) / center_anc[:, 2:]\n", " offset_wh = 5.0 * torch.log(eps + center_assigned_bb[:, 2:] / center_anc[:, 2:])\n", " offset = torch.cat([offset_xy, offset_wh], dim = 1) * bbox_mask # (锚框总数, 4)\n", "\n", " return offset.view(-1), bbox_mask.view(-1), cls_labels\n", " \n", " batch_offset = []\n", " batch_mask = []\n", " batch_cls_labels = []\n", " for b in range(bn):\n", " offset, bbox_mask, cls_labels = MultiBoxTarget_one(anchor[0, :, :], label[b, :, :])\n", " \n", " batch_offset.append(offset)\n", " batch_mask.append(bbox_mask)\n", " batch_cls_labels.append(cls_labels)\n", " \n", " bbox_offset = torch.stack(batch_offset)\n", " bbox_mask = torch.stack(batch_mask)\n", " cls_labels = torch.stack(batch_cls_labels)\n", " \n", " return [bbox_offset, bbox_mask, cls_labels]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "labels = MultiBoxTarget(anchors.unsqueeze(dim=0),\n", " ground_truth.unsqueeze(dim=0))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[0, 1, 2, 0, 2]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels[2]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1.,\n", " 1., 1.]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels[1]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, 1.4000e+00,\n", " 1.0000e+01, 2.5940e+00, 7.1754e+00, -1.2000e+00, 2.6882e-01,\n", " 1.6824e+00, -1.5655e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,\n", " -0.0000e+00, -5.7143e-01, -1.0000e+00, 4.1723e-06, 6.2582e-01]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9.4.4. 输出预测边界框" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "anchors = torch.tensor([[0.1, 0.08, 0.52, 0.92], [0.08, 0.2, 0.56, 0.95],\n", " [0.15, 0.3, 0.62, 0.91], [0.55, 0.2, 0.9, 0.88]])\n", "offset_preds = torch.tensor([0.0] * (4 * len(anchors)))\n", "cls_probs = torch.tensor([[0., 0., 0., 0.,], # 背景的预测概率\n", " [0.9, 0.8, 0.7, 0.1], # 狗的预测概率\n", " [0.1, 0.2, 0.3, 0.9]]) # 猫的预测概率" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig = d2l.plt.imshow(img)\n", "show_bboxes(fig.axes, anchors * bbox_scale,\n", " ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# 以下函数已保存在d2lzh_pytorch包中方便以后使用\n", "from collections import namedtuple\n", "Pred_BB_Info = namedtuple(\"Pred_BB_Info\", [\"index\", \"class_id\", \"confidence\", \"xyxy\"])\n", "\n", "def non_max_suppression(bb_info_list, nms_threshold = 0.5):\n", " \"\"\"\n", " 非极大抑制处理预测的边界框\n", " Args:\n", " bb_info_list: Pred_BB_Info的列表, 包含预测类别、置信度等信息\n", " nms_threshold: 阈值\n", " Returns:\n", " output: Pred_BB_Info的列表, 只保留过滤后的边界框信息\n", " \"\"\"\n", " output = []\n", " # 先根据置信度从高到低排序\n", " sorted_bb_info_list = sorted(bb_info_list, key = lambda x: x.confidence, reverse=True)\n", "\n", " while len(sorted_bb_info_list) != 0:\n", " best = sorted_bb_info_list.pop(0)\n", " output.append(best)\n", " \n", " if len(sorted_bb_info_list) == 0:\n", " break\n", "\n", " bb_xyxy = []\n", " for bb in sorted_bb_info_list:\n", " bb_xyxy.append(bb.xyxy)\n", " \n", " iou = compute_jaccard(torch.tensor([best.xyxy]), \n", " torch.tensor(bb_xyxy))[0] # shape: (len(sorted_bb_info_list), )\n", " \n", " n = len(sorted_bb_info_list)\n", " sorted_bb_info_list = [sorted_bb_info_list[i] for i in range(n) if iou[i] <= nms_threshold]\n", " return output\n", "\n", "def MultiBoxDetection(cls_prob, loc_pred, anchor, nms_threshold = 0.5):\n", " \"\"\"\n", " # 按照「9.4.1. 生成多个锚框」所讲的实现, anchor表示成归一化(xmin, ymin, xmax, ymax).\n", " https://zh.d2l.ai/chapter_computer-vision/anchor.html\n", " Args:\n", " cls_prob: 经过softmax后得到的各个锚框的预测概率, shape:(bn, 预测总类别数+1, 锚框个数)\n", " loc_pred: 预测的各个锚框的偏移量, shape:(bn, 锚框个数*4)\n", " anchor: MultiBoxPrior输出的默认锚框, shape: (1, 锚框个数, 4)\n", " nms_threshold: 非极大抑制中的阈值\n", " Returns:\n", " 所有锚框的信息, shape: (bn, 锚框个数, 6)\n", " 每个锚框信息由[class_id, confidence, xmin, ymin, xmax, ymax]表示\n", " class_id=-1 表示背景或在非极大值抑制中被移除了\n", " \"\"\"\n", " assert len(cls_prob.shape) == 3 and len(loc_pred.shape) == 2 and len(anchor.shape) == 3\n", " bn = cls_prob.shape[0]\n", " \n", " def MultiBoxDetection_one(c_p, l_p, anc, nms_threshold = 0.5):\n", " \"\"\"\n", " MultiBoxDetection的辅助函数, 处理batch中的一个\n", " Args:\n", " c_p: (预测总类别数+1, 锚框个数)\n", " l_p: (锚框个数*4, )\n", " anc: (锚框个数, 4)\n", " nms_threshold: 非极大抑制中的阈值\n", " Return:\n", " output: (锚框个数, 6)\n", " \"\"\"\n", " pred_bb_num = c_p.shape[1]\n", " anc = (anc + l_p.view(pred_bb_num, 4)).detach().cpu().numpy() # 加上偏移量\n", " \n", " confidence, class_id = torch.max(c_p, 0)\n", " confidence = confidence.detach().cpu().numpy()\n", " class_id = class_id.detach().cpu().numpy()\n", " \n", " pred_bb_info = [Pred_BB_Info(\n", " index = i,\n", " class_id = class_id[i] - 1, # 正类label从0开始\n", " confidence = confidence[i],\n", " xyxy=[*anc[i]]) # xyxy是个列表\n", " for i in range(pred_bb_num)]\n", " \n", " # 正类的index\n", " obj_bb_idx = [bb.index for bb in non_max_suppression(pred_bb_info, nms_threshold)]\n", " \n", " output = []\n", " for bb in pred_bb_info:\n", " output.append([\n", " (bb.class_id if bb.index in obj_bb_idx else -1.0),\n", " bb.confidence,\n", " *bb.xyxy\n", " ])\n", " \n", " return torch.tensor(output) # shape: (锚框个数, 6)\n", " \n", " batch_output = []\n", " for b in range(bn):\n", " batch_output.append(MultiBoxDetection_one(cls_prob[b], loc_pred[b], anchor[0], nms_threshold))\n", " \n", " return torch.stack(batch_output)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[[ 0.0000, 0.9000, 0.1000, 0.0800, 0.5200, 0.9200],\n", " [-1.0000, 0.8000, 0.0800, 0.2000, 0.5600, 0.9500],\n", " [-1.0000, 0.7000, 0.1500, 0.3000, 0.6200, 0.9100],\n", " [ 1.0000, 0.9000, 0.5500, 0.2000, 0.9000, 0.8800]]])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output = MultiBoxDetection(\n", " cls_probs.unsqueeze(dim=0), offset_preds.unsqueeze(dim=0),\n", " anchors.unsqueeze(dim=0), nms_threshold=0.5)\n", "output" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig = d2l.plt.imshow(img)\n", "for i in output[0].detach().cpu().numpy():\n", " if i[0] == -1:\n", " continue\n", " label = ('dog=', 'cat=')[int(i[0])] + str(i[1])\n", " show_bboxes(fig.axes, [torch.tensor(i[2:]) * bbox_scale], label)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:py36]", "language": "python", "name": "conda-env-py36-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }