本文已参与「新人创作礼」活动，一起开启掘金创作之路。

首先导入所需要的工具包

import numpy as np
import torchvision
import torch
import torchvision.transforms as transforms
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET

1. 目标检测和边界框

# Test image
# NOTE(review): this is a non-raw string. The backslash sequences in it are not
# recognised escapes, so they stay literal; `\\2` is doubled so that `\200` is
# not read as an octal escape.
# imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'


# Read the image from disk and display it
image = plt.imread(imagepath)
plt.imshow(image)

<matplotlib.image.AxesImage at 0x22666b24208>

对于这张带标注的图像，可以从对应的xml文件中获取其坐标信息，脚本如下

# Purpose: given an image path, find the annotation XML with the same base
# name and return the bounding boxes listed in it.
def image_to_boxes(imagepath, xml_dir=None):
    """Return the annotated bounding boxes of an image as a tensor.

    Args:
        imagepath: Path of the image. Only its base name without extension is
            used to locate ``<name>.xml``; backslash and forward-slash
            separators are both accepted.
        xml_dir: Directory that holds the annotation XML files. When omitted,
            the VOC2012 ``Annotations`` folder hard-coded below is used.

    Returns:
        Integer tensor of shape ``(num_objects, 4)`` whose rows are
        ``[xmin, ymin, xmax, ymax]``.
    """
    if xml_dir is None:
        xml_dir = r'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\Annotations'

    # Normalise separators so Windows-style paths also work on POSIX, and use
    # splitext so that dots inside the file name are preserved.
    filename = imagepath.replace('\\', '/').rsplit('/', 1)[-1]
    imagename = os.path.splitext(filename)[0]
    xmlpath = os.path.join(xml_dir, imagename + '.xml')

    # Every <object> element carries one <bndbox> with the four corner values
    root = ET.parse(xmlpath).getroot()

    boxes = []
    for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        # Coordinates may be written as floats ("298.0"), hence int(float(...))
        boxes.append([
            int(float(bbox.find(tag).text.strip()))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        ])

    # reshape keeps the (N, 4) layout even when the image has no objects
    return torch.tensor(boxes, dtype=torch.long).reshape(-1, 4)

测试

# Load the annotated boxes of the test image; the bare name echoes them in the notebook
boxes = image_to_boxes(imagepath)
boxes

tensor([[ 3, 18, 114, 298],
        [109, 51, 214, 297],
        [212, 35, 316, 297],
        [289, 33, 387, 297],
        [381, 15, 500, 297]])

现在可以从图像中获取到坐标信息，边界框是矩形的，由矩形左上角的 x 和 y 坐标以及右下角的坐标决定。另一种常用的边界框表示方法是边界框中心的 (x,y) 轴坐标以及框的宽度和高度。

# Convert boxes from (upper-left, lower-right) to (centre, width, height)
def box_corner_to_center(boxes):
    """Convert ``(x1, y1, x2, y2)`` boxes to ``(cx, cy, w, h)``.

    Args:
        boxes: Tensor or nested list of shape ``(N, 4)``.

    Returns:
        Floating-point tensor of shape ``(N, 4)``.
    """
    # as_tensor reuses an existing tensor instead of copy-constructing it,
    # which avoids the UserWarning torch.tensor() emits when given a tensor.
    boxes = torch.as_tensor(boxes)
    # Centres of integer boxes can be fractional; work in floating point so
    # all four output columns share one dtype.
    if not boxes.is_floating_point():
        boxes = boxes.to(torch.get_default_dtype())

    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]

    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    w = x2 - x1
    h = y2 - y1

    # dim=-1 places the four quantities side by side as columns
    return torch.stack((cx, cy, w, h), dim=-1)


# Convert the corner-format boxes to (centre, width, height) and echo the result
boxes = box_corner_to_center(boxes)
boxes

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """ tensor([[ 0.3295, 0.5800, -0.0630, 0.0600], [ 0.3173, 0.5640, -0.0445, 0.0120], [ 0.2975, 0.4858, -0.0310, -0.0815], [ 0.2350, 0.3733, -0.2060, -0.1965]])

# Convert boxes from (centre, width, height) to (upper-left, lower-right)
def box_center_to_corner(boxes):
    """Convert ``(cx, cy, w, h)`` boxes to ``(x1, y1, x2, y2)``.

    Args:
        boxes: Tensor or nested list of shape ``(N, 4)``.

    Returns:
        Tensor of shape ``(N, 4)`` holding the corner coordinates.
    """
    # as_tensor reuses an existing tensor instead of copy-constructing it,
    # which avoids the UserWarning torch.tensor() emits when given a tensor.
    boxes = torch.as_tensor(boxes)

    cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]

    # Each corner lies half a width / half a height away from the centre
    half_w = 0.5 * w
    half_h = 0.5 * h
    x1 = cx - half_w
    y1 = cy - half_h
    x2 = cx + half_w
    y2 = cy + half_h

    # dim=-1 places the four quantities side by side as columns
    return torch.stack((x1, y1, x2, y2), dim=-1)

# Convert back to corner format and echo the result
boxes = box_center_to_corner(boxes)
boxes

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """ tensor([[0.3610, 0.5500, 0.2980, 0.6100], [0.3395, 0.5580, 0.2950, 0.5700], [0.3130, 0.5265, 0.2820, 0.4450], [0.3380, 0.4715, 0.1320, 0.2750]])

可以看见，经过两次的转换，数据会变回原来的格式

# Round trip corner -> centre -> corner and compare element-wise with the input
box_center_to_corner(box_corner_to_center(boxes)) == boxes

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """ tensor([[True, True, True, True], [True, True, True, True], [True, True, True, True], [True, True, True, True], [True, True, True, True]])

现在尝试在原图上根据这些坐标绘制出边界框。这里定义一个辅助函数 bbox_to_rect。它将边界框表示成 matplotlib 的边界框格式

# Convert one box from (left x, top y, right x, bottom y) to the matplotlib
# rectangle form ((left x, top y), width, height).
def bbox_to_rect(bbox, color, linewidth=2):
    """Build an unfilled matplotlib rectangle patch for a single box.

    Args:
        bbox: One box indexed as ``[x1, y1, x2, y2]`` (not a batch of boxes).
        color: Edge colour of the outline.
        linewidth: Thickness of the outline.

    Returns:
        The ``plt.Rectangle`` patch describing the box.
    """
    left, top = bbox[0], bbox[1]
    # Extent = lower-right corner minus upper-left corner
    box_width = bbox[2] - bbox[0]
    box_height = bbox[3] - bbox[1]

    # fill=False draws only the outline; otherwise the box would be a solid block
    return plt.Rectangle(
        (left, top),
        box_width,
        box_height,
        fill=False,
        edgecolor=color,
        linewidth=linewidth,
    )

在原图上绘制边界框

color = ['blue', 'red', 'yellow', 'gray', 'pink']

# Show the picture
img = plt.imshow(image)
# Hide the axes
plt.axis('off')

# Outline every box with its own colour. The colours cycle, so an image with
# more boxes than colours no longer raises IndexError.
for index, bbox in enumerate(boxes):
    # Boxes are drawn by adding a patch on top of the displayed image
    img.axes.add_patch(bbox_to_rect(bbox, color[index % len(color)], linewidth=3))

2. 锚框

torch.set_printoptions(2)  # print tensors with a reduced precision of 2 digits

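要生成多个不同形状的锚框，让我们设置一系列缩放比（scale）$s_1, \ldots, s_n$ 和一系列宽高比 $r_1, \ldots, r_m$。当以每个像素为中心、使用这些缩放比和宽高比的所有组合时，高为 $h$、宽为 $w$ 的输入图像将总共有 $whnm$ 个锚框。

尽管这些锚框可能会覆盖所有真实边界框（ground truth），但计算复杂度很容易过高。在实践中，我们只考虑包含 $s_1$ 或 $r_1$ 的组合：

$(s_1, r_1), (s_1, r_2), \ldots, (s_1, r_m), (s_2, r_1), (s_3, r_1), \ldots, (s_n, r_1)$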

# Purpose: given an input image tensor, a list of scales and a list of aspect
# ratios, generate anchor boxes of different shapes centred on every pixel and
# return all of them.
def multibox_prior(data, sizes, ratios):
    """Generate corner-format anchors for every pixel of ``data``.

    Args:
        data: Tensor whose last two dimensions are (height, width).
        sizes: Scales, as fractions of the image size.
        ratios: Width/height aspect ratios.

    Returns:
        Tensor of shape ``(1, height * width * (len(sizes) + len(ratios) - 1), 4)``
        with rows ``[x1, y1, x2, y2]`` in coordinates normalised by the image
        size. The input shape and the anchor widths/heights are printed as a
        side effect.
    """
    dev = data.device
    scales = torch.tensor(sizes, device=dev)
    aspects = torch.tensor(ratios, device=dev)

    print(data.shape)
    height, width = data.shape[-2:]

    # To keep the anchors from getting too dense, only the combinations that
    # contain sizes[0] or ratios[0] are generated.
    per_pixel = len(sizes) + len(ratios) - 1

    # Pixel centres scaled into [0, 1]: index + 0.5 is the middle of a pixel
    # and 1/height, 1/width are the normalised step sizes.
    ys = (torch.arange(height, device=dev) + 0.5) * (1.0 / height)
    xs = (torch.arange(width, device=dev) + 0.5) * (1.0 / width)

    # Grid of centre coordinates, flattened so that (flat_x[k], flat_y[k]) is
    # the centre of the k-th pixel.
    grid_y, grid_x = torch.meshgrid(ys, xs)
    flat_y = grid_y.reshape(-1)
    flat_x = grid_x.reshape(-1)

    # The centre is written twice per row so that adding the offsets below
    # yields the upper-left and lower-right corners directly.
    centers = torch.stack([flat_x, flat_y, flat_x, flat_y], dim=1)
    # Every pixel owns `per_pixel` anchors, so each row is repeated in place
    centers = centers.repeat_interleave(per_pixel, dim=0)

    # Anchor widths and heights as fractions of the image. The alternative of
    # computing pixel sizes with s * sqrt(w * h * r) and s * sqrt(w * h / r)
    # is not used here.
    root_first = torch.sqrt(aspects[0])
    root_rest = torch.sqrt(aspects[1:])
    # The height / width factor rescales the widths so that a ratio of 1 is
    # drawn as a square on a non-square image.
    anchor_w = torch.cat((scales * root_first, scales[0] * root_rest)) * height / width
    anchor_h = torch.cat((scales / root_first, scales[0] / root_rest))
    print(anchor_w)
    print(anchor_h)

    # Half extents (-w/2, -h/2, +w/2, +h/2): transposed to one row per anchor
    # shape, then tiled once per pixel.
    half_extents = torch.stack((-anchor_w, -anchor_h, anchor_w, anchor_h))
    half_extents = half_extents.T.repeat(height * width, 1) / 2

    # Centre plus offset gives the anchor corners; a leading batch axis is added
    return (centers + half_extents).unsqueeze(0)

为了显示以图像中一个像素为中心的所有锚框，我们定义了以下 show_bboxes 函数来在图像上绘制多个边界框

# Purpose: draw a set of bounding boxes (e.g. the anchors of one pixel) with
# text labels on top of an image.
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw ``bboxes`` and their labels on a matplotlib axes.

    Args:
        axes: Axes to draw on; must provide ``add_patch`` and ``text``.
        bboxes: Iterable of boxes, each indexed as ``[x1, y1, x2, y2]``.
        labels: One label per box. A single string is treated as one label
            instead of being indexed character by character. Defaults to the
            box indices ``0..len(bboxes)-1``.
        colors: Colours cycled through for the boxes; a single colour string
            is also accepted. Defaults to five preset colours.
    """
    if colors is None:
        colors = ['blue', 'red', 'green', 'gray', 'pink']
    elif isinstance(colors, str):
        colors = [colors]

    if labels is None:
        labels = list(range(len(bboxes)))
    elif isinstance(labels, str):
        # Without this, labels[i] below would pick single characters
        labels = [labels]

    for i, bbox in enumerate(bboxes):
        # Cycle through the colour list
        color = colors[i % len(colors)]
        rect = bbox_to_rect(bbox, color)
        # Add the rectangle as a patch on the image
        axes.add_patch(rect)
        # Boxes beyond the end of `labels` are drawn without a caption
        if i < len(labels):
            axes.text(rect.xy[0], rect.xy[1], labels[i], fontsize=20, color='white',
                      va='center', ha='center', bbox=dict(facecolor=color, edgecolor="black"))

测试图像，以下是测试代码：尝试生成图像上所有像素点的anchor框，然后挑选其中一个像素点，将其对应的5个anchor绘制出来

# imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'

image = plt.imread(imagepath)
# print("image.shape:{}".format(image.shape))

# The first two image dimensions are taken as height and width
h, w = image.shape[:2]
# print(h, w)
# Random tensor that only carries the image size in (batch, channel, h, w) layout
X = torch.rand(size=(1, 3, h, w))

# Scales and aspect ratios (sizes are fractions of the original image)
sizes = [0.75, 0.5, 0.25]
ratios = [1, 2, 0.5]

# Another way of defining anchors would be to specify the area of each anchor
Y = multibox_prior(X, sizes, ratios)
# 5 = len(sizes) + len(ratios) - 1 anchors per pixel, 4 coordinates each
boxes = Y.reshape(h, w, 5, 4)
print("boxes.shape:{}".format(boxes.shape))
# print(Y)

# Print all anchors centred on pixel (row 250, column 250)
print(boxes[250, 250, :, :])

# Draw the anchors
fig = plt.imshow(image)
# The anchor coordinates are normalised, so scale them back to pixel units
bbox_scale = torch.tensor((w, h, w, h))
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale)

torch.Size([1, 3, 298, 500])
tensor([0.4470, 0.2980, 0.1490, 0.6322, 0.3161])
tensor([0.7500, 0.5000, 0.2500, 0.5303, 1.0607])
boxes.shape:torch.Size([298, 500, 5, 4])
tensor([[0.2775, 0.4656, 0.7245, 1.2156],
        [0.3520, 0.5906, 0.6500, 1.0906],
        [0.4265, 0.7156, 0.5755, 0.9656],
        [0.1849, 0.5754, 0.8171, 1.1058],
        [0.3430, 0.3103, 0.6590, 1.3709]])

3. 交并比

接下来使用交并比来衡量锚框和真实边界框之间、以及不同锚框之间的相似度

先可以查看上面这幅图像的标注信息，由于这次的测试图像有5个人像，所以返回的标注信息也是5个

# imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
# Annotated (ground-truth) boxes of the test image, in pixel coordinates
gt_boxes = image_to_boxes(imagepath)
gt_boxes

[[3, 18, 114, 298],
 [109, 51, 214, 297],
 [212, 35, 316, 297],
 [289, 33, 387, 297],
 [381, 15, 500, 297]]

以刚刚所测试的边界框为示例，计算两个框之间的交并比，不过这里需要对边界框进行转换

# 查看转换后的边界框的真实坐标
pd_boxes = boxes[250, 250, :, :] * bbox_scale
pd_boxes

tensor([[138.7500, 138.7500, 362.2500, 362.2500],
        [176.0000, 176.0000, 325.0000, 325.0000],
        [213.2500, 213.2500, 287.7500, 287.7500],
        [ 92.4617, 171.4808, 408.5384, 329.5192],
        [171.4808, 92.4616, 329.5192, 408.5384]])

下面定义一个函数可以计算anchor(pd_boxes)与真实边界框(gt_boxes)之间的交并比

# Purpose: compute the pairwise intersection-over-union (IoU) of two lists of
# anchor or bounding boxes.
# Returns: one row per box in boxes1 and one column per box in boxes2,
# e.g. 5 boxes against 5 anchors gives a 5x5 matrix.
def box_iou(boxes1, boxes2):
    """Return the IoU matrix between two sets of corner-format boxes.

    Args:
        boxes1: Tensor or nested list of shape ``(N, 4)``, rows ``[x1, y1, x2, y2]``.
        boxes2: Tensor or nested list of shape ``(M, 4)``, same layout.

    Returns:
        Tensor of shape ``(N, M)`` where entry ``[i, j]`` is
        ``intersection / union`` of ``boxes1[i]`` and ``boxes2[j]``.
    """
    # Non-tensor inputs are converted to tensors first
    boxes1 = boxes1 if isinstance(boxes1, torch.Tensor) else torch.tensor(boxes1)
    boxes2 = boxes2 if isinstance(boxes2, torch.Tensor) else torch.tensor(boxes2)

    def _area(b):
        # width * height from the corner coordinates
        return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    area_first = _area(boxes1)   # shape (N,)
    area_second = _area(boxes2)  # shape (M,)

    # Indexing with None inserts an axis so every box of boxes1 is broadcast
    # against every box of boxes2; both corner tensors have shape (N, M, 2).
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # A negative extent means the two boxes do not overlap; clamp it to zero
    overlap_wh = (bottom_right - top_left).clamp(min=0)

    # Width times height of the overlap: shape (N, M)
    intersection = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]

    # area_first is lifted to (N, 1) so the sum broadcasts to (N, M)
    union = area_first[:, None] + area_second - intersection

    # IoU = intersection / union
    return intersection / union

下面计算上面所使用的黑人图像与在[250, 250]像素点所预测的5个anchor之间的交并比

# IoU of every ground-truth box (rows) against every anchor of the pixel (columns)
iou = box_iou(gt_boxes, pd_boxes)
iou, iou.shape

(tensor([[0.0000, 0.0000, 0.0000, 0.0348, 0.0000],
         [0.1864, 0.1059, 0.0018, 0.2105, 0.1296],
         [0.2709, 0.3414, 0.2037, 0.2035, 0.3803],
         [0.1805, 0.0996, 0.0000, 0.1936, 0.1227],
         [0.0000, 0.0000, 0.0000, 0.0432, 0.0000]]),
 torch.Size([5, 5]))

对应着之前在250×250这个生成的像素点的图像观察

可以比较清楚的看见，对于第一个黑人的图像，其真实的标注框与0,1,2,4四个生成的anchor都没有联系，只与第3个anchor有少量的交集，可以看见，函数计算出来的交并比为[0.0000, 0.0000, 0.0000, 0.0348, 0.0000]，这是符合我们所观察到的数据的。也就是说，对于第一个真实标注框，其与250×250该像素点生成的5个anchor的交并比分别为：0.0000, 0.0000, 0.0000, 0.0348, 0.0000，其余的类似

至此，接下来需要对anchor进行训练。而训练需要对anchor设定类别与偏移量

4. 将真实边界框分配给锚框

给定图像，假设锚框是 $A_1, A_2, \ldots, A_{n_a}$，真实边界框是 $B_1, B_2, \ldots, B_{n_b}$，其中 $n_a \geq n_b$。让我们定义一个矩阵 $\mathbf{X} \in \mathbb{R}^{n_a \times n_b}$，其中第 $i$ 行、第 $j$ 列的元素 $x_{ij}$ 是锚框 $A_i$ 和真实边界框 $B_j$ 的 IoU。该算法包含以下步骤：

1）在矩阵 $\mathbf{X}$ 中找到最大的元素，并将它的行索引和列索引分别表示为 $i_1$ 和 $j_1$。然后将真实边界框 $B_{j_1}$ 分配给锚框 $A_{i_1}$。这很直观，因为 $A_{i_1}$ 和 $B_{j_1}$ 是所有锚框和真实边界框配对中最相近的。在第一个分配完成后，丢弃矩阵中第 $i_1$ 行和第 $j_1$ 列中的所有元素。

2）在矩阵 $\mathbf{X}$ 中找到剩余元素中最大的元素，并将它的行索引和列索引分别表示为 $i_2$ 和 $j_2$。我们将真实边界框 $B_{j_2}$ 分配给锚框 $A_{i_2}$，并丢弃矩阵中第 $i_2$ 行和第 $j_2$ 列中的所有元素。

3）此时，矩阵 $\mathbf{X}$ 中两行和两列中的元素已被丢弃。我们继续，直到丢弃掉矩阵 $\mathbf{X}$ 中 $n_b$ 列中的所有元素。此时，我们已经为这 $n_b$ 个锚框各自分配了一个真实边界框。

4）只遍历剩下的 $n_a - n_b$ 个锚框。例如，给定任何锚框 $A_i$，在矩阵 $\mathbf{X}$ 的第 $i$ 行中找到与 $A_i$ 的 IoU 最大的真实边界框 $B_j$，只有当此 IoU 大于预定义的阈值时，才将 $B_j$ 分配给 $A_i$。

用一个实际的例子说明以上过程：

让我们用一个具体的例子来说明上述算法。如图13.4.2（左）所示，假设矩阵 $\mathbf{X}$ 中的最大值为 $x_{23}$，我们将真实边界框 $B_3$ 分配给锚框 $A_2$。然后，我们丢弃矩阵第 2 行和第 3 列中的所有元素，在剩余元素（阴影区域）中找到最大的 $x_{71}$，然后将真实边界框 $B_1$ 分配给锚框 $A_7$。接下来，如图13.4.2（中）所示，丢弃矩阵第 7 行和第 1 列中的所有元素，在剩余元素（阴影区域）中找到最大的 $x_{54}$，然后将真实边界框 $B_4$ 分配给锚框 $A_5$。最后，如图13.4.2（右）所示，丢弃矩阵第 5 行和第 4 列中的所有元素，在剩余元素（阴影区域）中找到最大的 $x_{92}$，然后将真实边界框 $B_2$ 分配给锚框 $A_9$。之后，我们只需要遍历剩余的锚框 $A_1, A_3, A_4, A_6, A_8$，然后根据阈值确定是否为它们分配真实边界框。

重新回顾一下真实边界框

imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
# Reload the ground-truth boxes of the test image and echo them
ground_truth = image_to_boxes(imagepath)
ground_truth

[[3, 18, 114, 298],
 [109, 51, 214, 297],
 [212, 35, 316, 297],
 [289, 33, 387, 297],
 [381, 15, 500, 297]]

显示150×150与150×300这两个像素点的anchor，每个像素点5个框，也就是一共有10个框。现在假设使用150×150与150×300这两个像素点的anchor对真实标注框进行预测，现在尝试将5个真实边界框分配给这10个anchor

# Draw the anchors of two pixels together with the ground-truth boxes
# color = ['blue','red','yellow','gray','pink']
fig = plt.imshow(image)
# Factors that scale normalised (x, y, x, y) coordinates back to pixels
bbox_scale = torch.tensor((w, h, w, h))
# Labels 0-4 go to the anchors of pixel (150, 150), 5-9 to those of (150, 300)
label = [i for i in range(10)]
show_bboxes(fig.axes, boxes[150, 150, :, :] * bbox_scale, labels=label[:5])
show_bboxes(fig.axes, boxes[150, 300, :, :] * bbox_scale, labels=label[5:])
for index, bbox in enumerate(gt_boxes):
    # Ground-truth boxes are added as black patches on top of the image
    fig.axes.add_patch(bbox_to_rect(bbox, 'black', linewidth=3))

# Purpose: given ground-truth boxes and anchors, assign the closest
# ground-truth box to each anchor.
# Idea: anchors whose best IoU reaches the threshold are assigned first; then,
# once per ground-truth box, the globally largest IoU is picked, its pair is
# assigned, and its row and column are discarded, so every ground-truth box
# ends up with an anchor.
def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    """Map every anchor to the index of a ground-truth box, or -1.

    Args:
        ground_truth: Tensor ``(num_gt_boxes, 4)`` of corner-format boxes.
        anchors: Tensor ``(num_anchors, 4)`` of corner-format anchors.
        device: Device on which the returned map is allocated.
        iou_threshold: Minimum IoU for an anchor to be assigned its
            best-matching ground-truth box.

    Returns:
        Long tensor of shape ``(num_anchors,)``; entry ``i`` is the index of
        the ground-truth box assigned to anchor ``i``, or -1 for none.
    """
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]

    # jaccard[i, j] is the IoU of anchor i with ground-truth box j
    jaccard = box_iou(anchors, ground_truth)

    # -1 marks "no ground-truth box assigned"
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)

    # Nothing to match; also avoids reducing over an empty dimension below
    if num_anchors == 0 or num_gt_boxes == 0:
        return anchors_bbox_map

    # Best ground-truth box per anchor, kept only where the IoU reaches the
    # caller-supplied threshold (previously a hard-coded 0.5).
    max_ious, indices = torch.max(jaccard, dim=1)
    above = max_ious >= iou_threshold
    anchors_bbox_map[above] = indices[above]

    # Each round consumes one row and one column, so at most
    # min(num_anchors, num_gt_boxes) rounds are meaningful.
    for _ in range(min(num_anchors, num_gt_boxes)):
        # argmax without dim works on the flattened (row-major) matrix;
        # integer divmod recovers (row, column) without float rounding.
        flat_idx = int(torch.argmax(jaccard))
        anc_idx, box_idx = divmod(flat_idx, num_gt_boxes)
        anchors_bbox_map[anc_idx] = box_idx
        # Discard the used column and row; -1 is below any valid IoU
        jaccard[:, box_idx] = -1
        jaccard[anc_idx, :] = -1

    return anchors_bbox_map


# Purpose: given the anchors and the ground-truth boxes assigned to them,
# compute the offset of every anchor.
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Encode assigned boxes as offsets relative to their anchors.

    Args:
        anchors: Corner-format anchors, shape ``(N, 4)``.
        assigned_bb: Corner-format boxes assigned to the anchors, ``(N, 4)``.
        eps: Small constant added inside the logarithm.

    Returns:
        Tensor ``(N, 4)`` with columns ``[dx, dy, dw, dh]``.
    """
    # Both inputs are moved from (corner, corner) to (centre, width, height)
    anc = box_corner_to_center(anchors)
    gt = box_corner_to_center(assigned_bb)

    anc_xy, anc_wh = anc[:, :2], anc[:, 2:]
    gt_xy, gt_wh = gt[:, :2], gt[:, 2:]

    # Centre shift relative to the anchor size; the factor 10 is 1 / 0.1
    # (sigma_x = sigma_y = 0.1)
    shift_xy = 10 * (gt_xy - anc_xy) / anc_wh
    # Log size ratio; the factor 5 is 1 / 0.2 (sigma_w = sigma_h = 0.2)
    shift_wh = 5 * torch.log(eps + gt_wh / anc_wh)

    # Concatenate column-wise and return
    return torch.cat([shift_xy, shift_wh], axis=1)


# Purpose: label the anchors using the ground-truth bounding boxes
def multibox_target(anchors, labels):
    """Compute offsets, masks and class labels for every anchor.

    Args:
        anchors: Tensor ``(1, num_anchors, 4)`` of corner-format anchors.
        labels: Tensor ``(batch, num_objects, 5)``; column 0 is the class and
            columns 1-4 are the box coordinates.

    Returns:
        Tuple ``(bbox_offset, bbox_mask, class_labels)`` with shapes
        ``(batch, num_anchors * 4)``, ``(batch, num_anchors * 4)`` and
        ``(batch, num_anchors)``. The input shapes are printed as a side effect.
    """
    print("anchors.shape:{}, labels.shape:{}".format(anchors.shape, labels.shape))

    # One entry of `labels` per image
    num_images = labels.shape[0]
    anchors = anchors.squeeze(0)
    device = anchors.device
    num_anchors = anchors.shape[0]

    offsets_per_image, masks_per_image, classes_per_image = [], [], []

    for img_idx in range(num_images):
        # Label rows of the current image
        label = labels[img_idx, :, :]

        # Match anchors against the box columns (the class column is left out)
        anchors_bbox_map = assign_anchor_to_bbox(label[:, 1:], anchors, device)
        assigned = anchors_bbox_map >= 0

        # 1 for assigned anchors, 0 otherwise, repeated for the 4 offsets
        bbox_mask = assigned.float().unsqueeze(-1).repeat(1, 4)

        # Class labels and assigned boxes start out as zeros; an anchor that
        # stays unassigned therefore keeps class 0 (background).
        class_labels = torch.zeros(num_anchors, dtype=torch.long, device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32, device=device)

        indices_true = torch.nonzero(assigned)
        bb_idx = anchors_bbox_map[indices_true]
        # Classes are shifted by one because 0 is reserved for background
        class_labels[indices_true] = label[bb_idx, 0].long() + 1
        assigned_bb[indices_true] = label[bb_idx, 1:]

        # Multiplying by the mask zeroes the offsets of unassigned anchors
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask

        offsets_per_image.append(offset.reshape(-1))
        masks_per_image.append(bbox_mask.reshape(-1))
        classes_per_image.append(class_labels)

    bbox_offset = torch.stack(offsets_per_image)
    bbox_mask = torch.stack(masks_per_image)
    class_labels = torch.stack(classes_per_image)

    return (bbox_offset, bbox_mask, class_labels)

测试代码：

# Purpose: normalise pixel-space ground-truth boxes and prepend a class column.
# NOTE(review): the script below called this helper but it was never defined;
# this version is reconstructed from the printed output that follows the script
# (rows of [0, xmin / w, ymin / h, xmax / w, ymax / h]).
def gtboxes_process(data, gt_boxes, class_id=0):
    """Turn pixel ``[xmin, ymin, xmax, ymax]`` boxes into label rows.

    Args:
        data: Tensor whose last two dimensions are (height, width).
        gt_boxes: Tensor or nested list of shape ``(N, 4)`` in pixels.
        class_id: Class value written into the first column of every row.

    Returns:
        Float tensor of shape ``(N, 5)`` with rows
        ``[class_id, xmin / w, ymin / h, xmax / w, ymax / h]``.
    """
    height, width = data.shape[-2:]
    gt_boxes = torch.as_tensor(gt_boxes, dtype=torch.float32).reshape(-1, 4)
    scale = torch.tensor([width, height, width, height],
                         dtype=torch.float32, device=gt_boxes.device)
    classes = torch.full((gt_boxes.shape[0], 1), float(class_id),
                         dtype=torch.float32, device=gt_boxes.device)
    return torch.cat([classes, gt_boxes / scale], dim=1)


if __name__ == '__main__':

    # imagepath = r'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\2007_001423.jpg'
    imagepath = r'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\2007_001526.jpg'

    image = plt.imread(imagepath)
    # print("image.shape:{}".format(image.shape))

    h, w = image.shape[:2]
    X = torch.rand(size=(1, 3, h, w))

    # Scales and aspect ratios (sizes are fractions of the original image)
    sizes = [0.75, 0.5, 0.25]
    ratios = [1, 2, 0.5]

    # Another way of defining anchors would be to specify the area of each anchor
    Y = multibox_prior(X, sizes, ratios)
    boxes = Y.reshape(h, w, 5, 4)
    # print(boxes.shape)

    # Anchors of two pixels, 5 each, so 10 anchors in total
    anchors = torch.cat([boxes[150, 150, :, :], boxes[150, 300, :, :]], dim=0)
    ground_truth = image_to_boxes(imagepath)    # values in original pixel size
    print(ground_truth)
    ground_truth = gtboxes_process(X, ground_truth)   # normalised values with class column
    print(ground_truth)

    # Returns the offsets, the mask and the class of every anchor
    labels = multibox_target(anchors.unsqueeze(dim=0), ground_truth.unsqueeze(dim=0))
    print(labels[2], labels[2].shape)
    print(labels[1], labels[1].shape)
    print(labels[0], labels[0].shape)

输出：

tensor([[  3,  18, 114, 298],
        [109,  51, 214, 297],
        [212,  35, 316, 297],
        [289,  33, 387, 297],
        [381,  15, 500, 297]])
tensor([[0.0000, 0.0060, 0.0604, 0.2280, 1.0000],
        [0.0000, 0.2180, 0.1711, 0.4280, 0.9966],
        [0.0000, 0.4240, 0.1174, 0.6320, 0.9966],
        [0.0000, 0.5780, 0.1107, 0.7740, 0.9966],
        [0.0000, 0.7620, 0.0503, 1.0000, 0.9966]])
anchors.shape:torch.Size([1, 10, 4]), labels.shape:torch.Size([1, 5, 5])
tensor([[0, 0, 0, 1, 1, 1, 0, 0, 1, 1]]) torch.Size([1, 10])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
         1., 1., 1., 1.]]) torch.Size([1, 40])
tensor([[-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
         -0.0000, -0.0000, -0.0000, -0.0000, -2.9107,  0.4746, -5.2323,  2.8598,
          0.6960,  0.7435, -2.0444, -1.2533,  1.6779,  0.6488, -4.1222,  0.8327,
         -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
          4.4293,  0.3480, -4.8843,  2.8953, -2.3096,  0.4904, -2.0922, -0.9382]]) torch.Size([1, 40])

可以看见输出值包含了3个部分：分别是anchor偏移量，anchor掩码（负样本掩码为0，正样本掩码为1），还有anchor类别标签

5. 使用非极大值抑制预测边界框

# Four hand-written anchors in normalised corner format; the trailing number on
# each row is the author's confidence annotation.
# NOTE(review): the last row is annotated 0.3 while cls_probs below uses 0.1.
anchors = torch.tensor([[ 0.2120,  0.2450,  0.5100,  0.8550],   # 0.9
                        [ 0.1920,  0.2730,  0.4870,  0.8430],   # 0.8
                        [ 0.1720,  0.3040,  0.4540,  0.7490],   # 0.7
                        [ 0.2720,  0.3340,  0.4040,  0.6090]])  # 0.3

# All predicted offsets are zero: one value per anchor coordinate
offset_preds = torch.tensor([0] * anchors.numel())
cls_probs = torch.tensor([[0] * 4,  # predicted probability of background
                      [0.9, 0.8, 0.7, 0.1]])  # predicted probability of a person

fig = plt.imshow(image)
show_bboxes(fig.axes, anchors * bbox_scale)

# Purpose: take anchors and predicted offsets and apply the inverse offset
# transform to obtain the predicted bounding-box coordinates.
def offset_inverse(anchors, offset_preds):
    """Decode predicted offsets into corner-format boxes.

    Args:
        anchors: Corner-format anchors, shape ``(N, 4)``.
        offset_preds: Offsets ``[dx, dy, dw, dh]`` per anchor, shape ``(N, 4)``.

    Returns:
        Tensor ``(N, 4)`` of predicted boxes ``[x1, y1, x2, y2]``.
    """
    # Work in (centre, width, height) form
    anc = box_corner_to_center(anchors)
    anc_xy, anc_wh = anc[:, :2], anc[:, 2:]

    # Inverse of the encoding: undo the factors 10 and 5, the division by the
    # anchor size and the logarithm
    pred_xy = (offset_preds[:, :2] * anc_wh / 10) + anc_xy
    pred_wh = torch.exp(offset_preds[:, 2:] / 5) * anc_wh

    # Join centre and size, then convert back to (upper-left, lower-right)
    centre_form = torch.cat((pred_xy, pred_wh), axis=1)
    return box_center_to_corner(centre_form)


# Purpose: non-maximum suppression (NMS)
def nms(boxes, scores, iou_threshold):
    """Return the indices of the boxes kept by non-maximum suppression.

    Args:
        boxes: Corner-format boxes, shape ``(N, 4)``.
        scores: Confidence per box, shape ``(N,)``.
        iou_threshold: A box is dropped when its IoU with an already kept,
            higher-scoring box exceeds this value.

    Returns:
        Long tensor with the kept indices, ordered by descending score. It is
        an empty long tensor when there are no boxes, so it can always be used
        for indexing.
    """
    # Box indices sorted by descending confidence
    order = torch.argsort(scores, dim=-1, descending=True)
    keep = []  # indices of the boxes that survive

    while order.numel() > 0:
        # The most confident remaining box is always kept
        best = int(order[0])
        keep.append(best)

        # Nothing left to compare against
        if order.numel() == 1:
            break

        # IoU of the kept box against every other remaining box
        iou = box_iou(boxes[best, :].reshape(-1, 4),
                      boxes[order[1:], :].reshape(-1, 4)).reshape(-1)

        # Boxes overlapping the kept one by no more than the threshold are
        # treated as different objects and stay in the running
        survivors = torch.nonzero(iou <= iou_threshold).reshape(-1)

        # `iou` was computed over order[1:], so shift by one to index `order`
        order = order[survivors + 1]

    # An explicit dtype keeps the result an index tensor even when it is empty
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)


# Purpose: predict bounding boxes using non-maximum suppression.
# Each output row holds: predicted class, confidence, then the upper-left and
# lower-right corner of the predicted box.
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, pos_threshold=0.009999999):
    """Decode predictions and suppress overlapping boxes.

    Args:
        cls_probs: Tensor ``(batch, num_classes, num_anchors)``; row 0 of the
            class axis is the background.
        offset_preds: Tensor ``(batch, num_anchors * 4)`` of predicted offsets.
        anchors: Tensor ``(1, num_anchors, 4)`` of corner-format anchors.
        nms_threshold: IoU threshold passed to ``nms``.
        pos_threshold: Predictions with a confidence below this value are
            marked as background.

    Returns:
        Tensor ``(batch, num_anchors, 6)`` with rows
        ``[class_id, confidence, x1, y1, x2, y2]``; ``class_id`` is -1 for
        suppressed or low-confidence boxes. Kept boxes come first.
    """
    # The leading dimension counts the images in the batch
    device = cls_probs.device
    batch_size = cls_probs.shape[0]
    anchors = anchors.squeeze(0)

    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    results = []

    for b in range(batch_size):
        # Class probabilities and offsets of the current image
        cls_prob = cls_probs[b]
        offset_pred = offset_preds[b].reshape(-1, 4)

        # Skip the background row; per anchor take the highest probability and
        # its row index, which is the zero-based foreground class
        conf, class_id = torch.max(cls_prob[1:], 0)

        # Apply the predicted offsets to the anchors, then run NMS
        predicted_bb = offset_inverse(anchors, offset_pred)
        keep = nms(predicted_bb, conf, nms_threshold)

        # Indices that appear only once in (keep + all indices) were not kept
        all_idx = torch.arange(num_anchors, dtype=torch.long, device=device)
        uniques, counts = torch.cat((keep, all_idx)).unique(return_counts=True)
        non_keep = uniques[counts == 1]
        order = torch.cat((keep, non_keep))

        # Suppressed boxes get class -1
        class_id[non_keep] = -1

        # Reorder so that the kept boxes come first
        class_id = class_id[order]
        conf = conf[order]
        predicted_bb = predicted_bb[order]

        # Low-confidence predictions are marked as background (-1) and their
        # confidence is replaced by 1 - conf
        low = conf < pos_threshold
        class_id[low] = -1
        conf[low] = 1 - conf[low]

        # One row per anchor: class, confidence, box corners
        results.append(
            torch.cat((class_id.unsqueeze(1), conf.unsqueeze(1), predicted_bb), dim=1))

    return torch.stack(results)

测试函数，其中设置图像有3个类别，0为背景类别，第二为人类别，第三类为其他类别

# Four hand-written anchors in normalised corner format; the trailing number on
# each row is the author's confidence annotation.
anchors = torch.tensor([[0.2120, 0.2450, 0.5100, 0.8550],  # 0.9
                            [0.1920, 0.2730, 0.4870, 0.8430],  # 0.8
                            [0.1720, 0.3040, 0.4540, 0.7490],  # 0.7
                            [0.2720, 0.3340, 0.4040, 0.6090]])  # 0.3
# All predicted offsets are zero: one value per anchor coordinate
offset_preds = torch.tensor([0] * anchors.numel())
# Rows are classes (background first), columns are the four anchors
cls_probs = torch.tensor([[0] * 4,                  # predicted probability of background
                          [0.9, 0.8, 0.7, 0.6],
                          [0.1, 0.2, 0.3, 0.4]])    # predicted probability of a person

# Add the leading batch dimension before calling the detector
output = multibox_detection(cls_probs.unsqueeze(dim=0),
                            offset_preds.unsqueeze(dim=0),
                            anchors.unsqueeze(dim=0),
                            nms_threshold=0.5)
print(output, output.shape)

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """ tensor([[[ 0.0000, 0.9000, 0.2120, 0.2450, 0.5100, 0.8550], [ 0.0000, 0.6000, 0.2720, 0.3340, 0.4040, 0.6090], [-1.0000, 0.8000, 0.1920, 0.2730, 0.4870, 0.8430], [-1.0000, 0.7000, 0.1720, 0.3040, 0.4540, 0.7490]]]) torch.Size([1, 4, 6])

可以看见输出结果可以除去重复度较高的iou预测边界框，并且输出类别信息。现在查看筛选出来的两个边界框，可以看见，相比之下去除了两个iou比较重复的边界框，使得结果更加清晰。

fig = plt.imshow(image)
for i in output[0]:
    # Class -1 marks boxes that were suppressed or fell below the threshold
    if i[0] == -1:
        continue
    label = ('dog=', 'cat=')[int(i[0])] + str(i[1])
    # show_bboxes indexes `labels` per box, so the caption is wrapped in a list
    show_bboxes(fig.axes, [i[2:] * bbox_scale], [label])

plt.show()

参考资料：

李沐大神的《动手学深度学习》

今天的文章【22】目标检测中锚框（anchor）的相关总结分享到此就结束了，感谢您的阅读。

版权声明：本文内容由互联网用户自发贡献，该文观点仅代表作者本人。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容，请发送邮件至举报，一经查实，本站将立刻删除。
如需转载请保留出处：https://bianchenghao.cn/20560.html

【22】目标检测中锚框（anchor）的相关总结

1. 目标检测和边界框

2. 锚框

3. 交并比

4. 将真实边界框分配给锚框

5. 使用非极大值抑制预测边界框

相关推荐

发表回复