YOLOv5 Study Notes 4: Source Code Analysis, the Head

The Detect class implements the detection head of YOLOv5.

The Detect class is defined at line 33 of yolo.py.

class Detect() code analysis

class Detect(nn.Module):
    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        self.anchor_grid = [torch.zeros(1)] * self.nl  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                y = x[i].sigmoid()
                if self.inplace:
                    y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x)

The initializer __init__()

First, let's look at the class's initializer:

def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
    super().__init__()
    self.nc = nc  # number of classes
    self.no = nc + 5  # number of outputs per anchor
    self.nl = len(anchors)  # number of detection layers
    self.na = len(anchors[0]) // 2  # number of anchors per layer
    self.grid = [torch.zeros(1)] * self.nl  # init grid
    self.anchor_grid = [torch.zeros(1)] * self.nl  # init anchor grid
    self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2) = (3,3,2)
    self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output convs
    self.inplace = inplace  # use in-place ops (e.g. slice assignment)

The YOLOv5 head still follows an FPN-style multi-scale structure, so self.m holds three 1×1 output convolutions. Their channel transitions are 128 $\longrightarrow$ 255 | 256 $\longrightarrow$ 255 | 512 $\longrightarrow$ 255.
self.no is the number of output channels per anchor position: each position predicts 80 class scores (COCO) + 4 box coordinates (xywh) + 1 confidence score, so 85 channels. Each scale has 3 anchor positions, so each output conv produces 85 × 3 = 255 channels. In summary, self.nl = 3 detection layers, self.na = 3 anchors per layer, and self.no = 85 outputs per anchor.
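
To make these numbers concrete, here is a minimal sketch (an assumption based on the yolov5s configuration: nc=80, the default COCO anchors, and head input channels 128/256/512) that rebuilds self.m and prints the three output convolutions:

import torch
import torch.nn as nn

# yolov5s-style configuration (assumed for illustration)
anchors = [[10, 13, 16, 30, 33, 23],       # P3/8
           [30, 61, 62, 45, 59, 119],      # P4/16
           [116, 90, 156, 198, 373, 326]]  # P5/32
ch = (128, 256, 512)  # input channels of the three head branches

nc = 80
na = len(anchors[0]) // 2  # 3 anchors per layer
no = nc + 5                # 85 outputs per anchor
m = nn.ModuleList(nn.Conv2d(c, no * na, 1) for c in ch)

for conv in m:
    print(conv)
# Conv2d(128, 255, kernel_size=(1, 1), stride=(1, 1))
# Conv2d(256, 255, kernel_size=(1, 1), stride=(1, 1))
# Conv2d(512, 255, kernel_size=(1, 1), stride=(1, 1))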

The forward() function

Next, look at the head's forward() function:

def forward(self, x):
    z = []  # inference output
    for i in range(self.nl):
        x[i] = self.m[i](x[i])  # conv
        bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
        x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        if not self.training:  # inference
            if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

            y = x[i].sigmoid()
            if self.inplace:
                y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                xy = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                y = torch.cat((xy, wh, y[..., 4:]), -1)
            z.append(y.view(bs, -1, self.no))

    return x if self.training else (torch.cat(z, 1), x)

x is a list holding the inputs of the three head branches. For a 256×256 input, their shapes are:

  • [bs, 128, 32, 32]
  • [bs, 256, 16, 16]
  • [bs, 512, 8, 8]

Each of the three inputs is passed through its own output convolution.

x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

This reshapes each x[i]:

x[0]: (bs, 255, 32, 32) => (bs, 3, 32, 32, 85)
x[1]: (bs, 255, 16, 16) => (bs, 3, 16, 16, 85)
x[2]: (bs, 255, 8, 8) => (bs, 3, 8, 8, 85)
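
A minimal sketch of this reshape on a dummy tensor (using na=3, no=85 and the 32×32 branch):

import torch

bs, na, no, ny, nx = 2, 3, 85, 32, 32
t = torch.randn(bs, na * no, ny, nx)  # raw conv output: (2, 255, 32, 32)
t = t.view(bs, na, no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
print(t.shape)  # torch.Size([2, 3, 32, 32, 85])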

The _make_grid() function

def _make_grid(self, nx=20, ny=20, i=0):
    d = self.anchors[i].device
    if check_version(torch.__version__, '1.10.0'):  # torch>=1.10.0 meshgrid workaround for torch>=0.7 compatibility
        yv, xv = torch.meshgrid([torch.arange(ny, device=d), torch.arange(nx, device=d)], indexing='ij')
    else:
        yv, xv = torch.meshgrid([torch.arange(ny, device=d), torch.arange(nx, device=d)])
    grid = torch.stack((xv, yv), 2).expand((1, self.na, ny, nx, 2)).float()
    anchor_grid = (self.anchors[i].clone() * self.stride[i]) \
        .view((1, self.na, 1, 1, 2)).expand((1, self.na, ny, nx, 2)).float()
    return grid, anchor_grid

_make_grid() prepares the grid of cell offsets. All predictions are expressed in grid units rather than in original-image pixels; multiplying by self.stride[i] later converts them back to image coordinates. Note that each layer's grid has a different size, matching that layer's output width and height.
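
A minimal sketch of the grid construction on a tiny 3×4 feature map (na=1 so the output stays readable; requires torch>=1.10 for the indexing argument):

import torch

ny, nx, na = 3, 4, 1
yv, xv = torch.meshgrid(torch.arange(ny), torch.arange(nx), indexing='ij')  # torch>=1.10
grid = torch.stack((xv, yv), 2).expand((1, na, ny, nx, 2)).float()
print(grid.shape)     # torch.Size([1, 1, 3, 4, 2])
print(grid[0, 0, 0])  # (x, y) offsets of row 0: [[0., 0.], [1., 0.], [2., 0.], [3., 0.]]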

y = x[i].sigmoid()
if self.inplace:
    y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
    xy = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
    y = torch.cat((xy, wh, y[..., 4:]), -1)
z.append(y.view(bs, -1, self.no))

This is the core of inference: YOLOv5's bounding-box regression mechanism.

Compared with YOLOv3's regression, the predicted box center offsets x, y are multiplied by 2 and shifted by 0.5, so the value range grows from YOLOv3's (0, 1) (note: an open interval) to (-0.5, 1.5). Intuitively, YOLOv5 can now predict a center up to half a cell beyond the current grid cell, which improves recall for boxes near cell borders. It also fixes the YOLOv3 problem that, because sigmoid's output is an open interval, a predicted center could never land exactly on a grid boundary.
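
Written out (with $\sigma$ the sigmoid, $(c_x, c_y)$ the grid-cell offsets from self.grid, $s$ the stride, and $(p_w, p_h)$ the anchor dimensions from self.anchor_grid):

$$b_x = \left(2\sigma(t_x) - 0.5 + c_x\right) \cdot s, \qquad b_y = \left(2\sigma(t_y) - 0.5 + c_y\right) \cdot s$$

$$b_w = \left(2\sigma(t_w)\right)^2 \cdot p_w, \qquad b_h = \left(2\sigma(t_h)\right)^2 \cdot p_h$$

Since $\sigma(\cdot) \in (0, 1)$, the center offset $2\sigma - 0.5$ lies in $(-0.5, 1.5)$ and the size multiplier $(2\sigma)^2$ lies in $(0, 4)$.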

The w, h regression also changed in YOLOv5. Compare against the YOLOv3 source:

x = torch.sigmoid(prediction[..., 0])          # Center x  (B, A, H, W)
y = torch.sigmoid(prediction[..., 1])          # Center y  (B, A, H, W)
w = prediction[..., 2]                         # Width     (B, A, H, W)
h = prediction[..., 3]                         # Height    (B, A, H, W)
pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

Clearly YOLOv3 applies no sigmoid to w and h, whereas YOLOv5 applies sigmoid to all of x, y, w, h. The scale factor applied to the anchor becomes $(2\sigma(t))^2$, so its range shrinks from YOLOv3's (0, +∞) (relative to the anchor width and height) to (0, 4). The likely intent is to make predicted box sizes better behaved: bounding the regression keeps the box scale within a sensible ratio of the anchor.
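
A quick numeric comparison of the two parameterizations (a sketch; t stands for a raw network output before any activation):

import torch

t = torch.tensor([-6., -2., 0., 2., 6.])  # raw wh outputs
print(torch.exp(t))                    # YOLOv3 style: (0, +inf), explodes for large t
print((2 * torch.sigmoid(t)) ** 2)     # YOLOv5 style: bounded in (0, 4)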

class Model() code analysis

Next, we analyze the Model class, mainly its forward pass, which involves two functions: forward() and _forward_once().

The forward() function

def forward(self, x, augment=False, profile=False, visualize=False):
    if augment:
        return self._forward_augment(x)  # augmented inference, None
    return self._forward_once(x, profile, visualize)  # single-scale inference, train

def _forward_augment(self, x):
    img_size = x.shape[-2:]  # height, width
    s = [1, 0.83, 0.67]  # scales
    f = [None, 3, None]  # flips (2-ud, 3-lr)
    y = []  # outputs
    for si, fi in zip(s, f):
        xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
        yi = self._forward_once(xi)[0]  # forward
        # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
        yi = self._descale_pred(yi, fi, si, img_size)
        y.append(yi)
    y = self._clip_augmented(y)  # clip augmented tails
    return torch.cat(y, 1), None  # augmented inference, train

In forward(), the augment flag enables test-time augmentation (TTA): when on, the input is run at three scales (1, 0.83, 0.67), with a left-right flip at the middle scale, and the de-scaled predictions are concatenated. It is off by default.
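
As a quick illustration of the flip codes in f (a sketch on a dummy NCHW tensor):

import torch

x = torch.arange(6.).view(1, 1, 2, 3)  # NCHW: dim 2 = height, dim 3 = width
print(x.flip(3))  # code 3: left-right flip
print(x.flip(2))  # code 2: up-down flip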

The source of scale_img() is as follows:

The scale_img() function

def scale_img(img, ratio=1.0, same_shape=False, gs=32):  # img(16,3,256,416)
    # Scales img(bs,3,y,x) by ratio constrained to gs-multiple
    if ratio == 1.0:
        return img
    else:
        h, w = img.shape[2:]
        s = (int(h * ratio), int(w * ratio))  # new size
        img = F.interpolate(img, size=s, mode='bilinear', align_corners=False)  # resize
        if not same_shape:  # pad/crop img
            h, w = (math.ceil(x * ratio / gs) * gs for x in (h, w))
        return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)  # value = imagenet mean

Scaling is done with plain bilinear interpolation, with ratio controlling the resize factor; the result is then padded with 0.447 (the ImageNet mean) so that its dimensions become multiples of the grid stride gs.
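
A usage sketch, assuming the scale_img() above is in scope (it relies on math and torch.nn.functional as F):

import math
import torch
import torch.nn.functional as F

img = torch.zeros(1, 3, 256, 256)
out = scale_img(img, ratio=0.83, gs=32)
print(out.shape)  # torch.Size([1, 3, 224, 224]): 212x212 after resize, padded up to the next multiple of 32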

The _forward_once() function

def _forward_once(self, x, profile=False, visualize=False):
    y, dt = [], []  # outputs
    for m in self.model:
        if m.f != -1:  # if not from previous layer
            x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
        if profile:
            self._profile_one_layer(m, x, dt)
        x = m(x)  # run
        y.append(x if m.i in self.save else None)  # save output
        if visualize:
            feature_visualization(x, m.type, m.i, save_dir=visualize)
    return x

_forward_once() simply runs every module in self.model once, in order, and returns the result. Each module's m.f attribute records where its input comes from: -1 means the previous layer's output, while other indices pull cached outputs from y (this is how Concat and Detect gather features from earlier layers); outputs whose indices are in self.save are cached for later use. Turning on profile records each module's average latency and FLOPs, which helps locate bottlenecks, speed up the model, and reduce memory use.
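
To illustrate the y-cache routing, here is a hypothetical three-layer toy (not the real Model class; froms and save mimic m.f and self.save):

import torch
import torch.nn as nn

layers = nn.ModuleList([nn.Identity(), nn.Identity(), nn.Identity()])
froms = [-1, -1, [-1, 0]]  # layer 2 consumes the previous output plus layer 0's cached output
save = {0}                 # indices whose outputs must be kept, like self.save

x, y = torch.randn(1, 4), []
for i, (m, f) in enumerate(zip(layers, froms)):
    if f != -1:  # not from the previous layer
        x = y[f] if isinstance(f, int) else [x if j == -1 else y[j] for j in f]
        x = torch.cat(x, 1) if isinstance(x, list) else x  # stand-in for a Concat module
    x = m(x)
    y.append(x if i in save else None)
print(x.shape)  # torch.Size([1, 8])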

This post walked through the forward-propagation and inference source code of the YOLOv5 head.
