YOLOv8 源码解读:模型定义与推理流程

本文深入解读 YOLOv8 的核心源码,包括模型配置文件解析、网络结构定义、前向传播流程和推理后处理,帮助理解其内部实现细节。

模型配置文件解析

YAML 配置结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# yolov8.yaml core structure

# Parameters
nc: 80  # number of classes
scales:
  # one entry per model size: [depth, width, max_channels]
  n: [0.33, 0.25, 1024]
  s: [0.33, 0.50, 1024]
  m: [0.67, 0.75, 768]
  l: [1.00, 1.00, 512]
  x: [1.00, 1.25, 512]

# Backbone -- every layer entry is [from, repeats, module, args]
backbone:
  - [-1, 1, Conv, [64, 3, 2]]    # 0: Conv, P1/2
  - [-1, 1, Conv, [128, 3, 2]]   # 1: Conv, P2/4
  - [-1, 3, C2f, [128, True]]    # 2: C2f
  - [-1, 1, Conv, [256, 3, 2]]   # 3: Conv, P3/8
  - [-1, 6, C2f, [256, True]]    # 4: C2f
  - [-1, 1, Conv, [512, 3, 2]]   # 5: Conv, P4/16
  - [-1, 6, C2f, [512, True]]    # 6: C2f
  - [-1, 1, Conv, [1024, 3, 2]]  # 7: Conv, P5/32
  - [-1, 3, C2f, [1024, True]]   # 8: C2f
  - [-1, 1, SPPF, [1024, 5]]     # 9: SPPF

# Head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]  # 10: upsample
  - [[-1, 6], 1, Concat, [1]]    # 11: concat with backbone P4
  - [-1, 3, C2f, [512]]          # 12: C2f

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]  # 13: upsample
  - [[-1, 4], 1, Concat, [1]]    # 14: concat with backbone P3
  - [-1, 3, C2f, [256]]          # 15: C2f (P3/8, small objects)

  - [-1, 1, Conv, [256, 3, 2]]   # 16: Conv
  - [[-1, 12], 1, Concat, [1]]   # 17: concat with head P4
  - [-1, 3, C2f, [512]]          # 18: C2f (P4/16, medium objects)

  - [-1, 1, Conv, [512, 3, 2]]   # 19: Conv
  - [[-1, 9], 1, Concat, [1]]    # 20: concat with backbone P5
  - [-1, 3, C2f, [1024]]         # 21: C2f (P5/32, large objects)

  - [[15, 18, 21], 1, Detect, [nc]]  # 22: Detect(P3, P4, P5)

配置解析代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# ultralytics/nn/tasks.py

def parse_model(d, ch):
    """Build the network described by a parsed YOLO model config.

    Args:
        d: Model config dict. Must contain ``nc``, ``backbone`` and ``head``;
            every layer entry is ``[from, repeats, module, args]``. Scaling
            factors are read from ``scales`` (using ``d['scale']`` or, when
            absent, the first entry) or else from ``depth_multiple`` /
            ``width_multiple``.
        ch: Input channel count, either an int or a sequence whose last
            element is the input channel count.

    Returns:
        An ``nn.Sequential`` with the layers in definition order. Every layer
        carries ``i`` (index), ``f`` (source index/indices), ``type`` (class
        path) and ``np`` (parameter count). The sorted indices of layers whose
        outputs are consumed later are attached as ``.save``; the upstream
        function returns that list as a second tuple element instead, which
        is kept off the return value here so existing callers are unaffected.
    """
    import ast
    import contextlib
    import math

    # Table header for the per-layer log rows emitted in the loop.
    LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10} {'module':<45}{'arguments':<30}")

    nc = d['nc']
    scales = d.get('scales')
    depth = d.get('depth_multiple', 1.0)
    width = d.get('width_multiple', 1.0)
    max_channels = float('inf')
    if scales:
        scale = d.get('scale') or next(iter(scales))
        depth, width, max_channels = scales[scale]

    # Work on a private list so an int is accepted and a caller's list is
    # never mutated.
    ch = list(ch) if isinstance(ch, (list, tuple)) else [ch]
    layers, save, c2 = [], [], ch[-1]

    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):
        # SECURITY: eval() executes arbitrary text taken from the config.
        # Only load configs from trusted sources, or replace this with an
        # explicit name -> class lookup.
        m = eval(m) if isinstance(m, str) else m

        # Resolve string arguments ("nc", "None", ...) without touching the
        # caller's config; strings that are not literals stay as they are.
        args = list(args)
        for j, a in enumerate(args):
            if not isinstance(a, str):
                continue
            if a == 'nc':
                args[j] = nc
            else:
                with contextlib.suppress(ValueError, SyntaxError):
                    args[j] = ast.literal_eval(a)

        # Depth scaling applies only to layers that repeat.
        n = n_ = max(round(n * depth), 1) if n > 1 else n

        if m in (Conv, C2f, Bottleneck, SPPF, SPP, DWConv, DWConvTranspose2d, ConvTranspose):
            c1, c2 = ch[f], args[0]
            if c2 != nc:
                # Width scaling: cap, scale, then round up to a multiple of 8.
                c2 = math.ceil(min(c2, max_channels) * width / 8) * 8
            args = [c1, c2, *args[1:]]
            if m is C2f:
                # C2f repeats internally: pass the count as its own argument
                # rather than stacking n modules with mismatched channels.
                args.insert(2, n)
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        elif m is Detect:
            args.append([ch[x] for x in f])
        else:
            # Channel-preserving layers such as nn.Upsample.
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)

        t = str(m)[8:-2].replace('__main__.', '')  # "<class 'pkg.Name'>" -> "pkg.Name"
        m_.np = sum(p.numel() for p in m_.parameters())
        m_.i, m_.f, m_.type = i, f, t
        LOGGER.info(f"{i:>3}{str(f):>20}{n_:>3}{m_.np:10.0f} {t:<45}{str(args):<30}")

        # Remember which earlier outputs must be kept for skip connections;
        # `x % i` turns negative offsets into absolute layer indices.
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)
        layers.append(m_)

        if i == 0:
            # Drop the input-channel entry so that ch[k] is layer k's output.
            ch = []
        ch.append(c2)

    model = nn.Sequential(*layers)
    model.save = sorted(save)
    return model

网络结构定义

基础模块

Conv 卷积模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# ultralytics/nn/modules.py

class Conv(nn.Module):
    """Convolution block: Conv2d -> BatchNorm2d -> activation."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Create the three sub-layers.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size (1 by default, but any size may be passed).
            s: Stride.
            p: Padding, forwarded to ``autopad`` together with ``k`` and ``d``.
            g: Convolution groups.
            d: Dilation.
            act: ``True`` selects SiLU, an ``nn.Module`` is used as given,
                anything else disables the activation.
        """
        super().__init__()
        # NOTE(review): autopad is defined elsewhere; presumably it derives
        # "same"-style padding when p is None -- confirm.
        padding = autopad(k, p, d)
        # No conv bias: the BatchNorm that follows has its own shift term.
        self.conv = nn.Conv2d(c1, c2, k, s, padding, groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = nn.SiLU()
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, batch norm and activation in that order."""
        x = self.conv(x)
        x = self.bn(x)
        return self.act(x)

Bottleneck 残差块

1
2
3
4
5
6
7
8
9
10
11
class Bottleneck(nn.Module):
    """Two stacked Conv blocks with an optional residual connection."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        """Create the two convolutions.

        Args:
            c1: Input channels.
            c2: Output channels.
            shortcut: Request the residual connection.
            g: Groups of the second convolution.
            k: Kernel sizes of the first and second convolution
                (both 3x3 by default).
            e: Hidden-channel ratio relative to ``c2``.
        """
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, k[0], 1)       # c1 -> hidden, kernel k[0]
        self.cv2 = Conv(hidden, c2, k[1], 1, g=g)  # hidden -> c2, kernel k[1]
        # The residual sum is only possible when the channel counts match.
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Run both convolutions and add the input back when enabled."""
        out = self.cv2(self.cv1(x))
        return x + out if self.add else out

C2f 模块(核心创新)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
class C2f(nn.Module):
    """CSP-style block that concatenates the output of every Bottleneck."""

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        """Create the split, bottleneck and fusion layers.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of Bottleneck blocks.
            shortcut: Forwarded to every Bottleneck.
            g: Groups, forwarded to every Bottleneck.
            e: Hidden-channel ratio relative to ``c2``.
        """
        super().__init__()
        # The hidden width follows the expansion ratio `e`.
        self.c = int(c2 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)     # produces the two halves
        self.cv2 = Conv((2 + n) * self.c, c2, 1)  # fuses 2 halves + n outputs
        self.m = nn.ModuleList(
            Bottleneck(self.c, self.c, shortcut, g, k=(3, 3), e=1.0) for _ in range(n)
        )

    def forward(self, x):
        """Split, run the Bottleneck chain, and fuse every intermediate map."""
        # Two halves of width self.c along the channel axis.
        y = list(self.cv1(x).split((self.c, self.c), 1))
        # Each Bottleneck consumes the newest map; all outputs are kept.
        y.extend(m(y[-1]) for m in self.m)
        return self.cv2(torch.cat(y, 1))

C2f vs C3 对比

1
2
3
4
5
6
7
8
9
10
11
# C3 (YOLOv5):
# - 只把 Bottleneck 串联后的最终输出与旁路分支拼接
# - 中间各级 Bottleneck 的特征不会直接进入输出

# C2f (YOLOv8):
# - 把 cv1 切分出的两份特征和每一个 Bottleneck 的输出全部拼接
# - 保留了各级中间特征
# - 梯度流更顺畅

# 效果:
# C2f 增加了少量计算,但显著提升了梯度传递

SPPF 模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
class SPPF(nn.Module):
    """Spatial pyramid pooling, fast variant: one max-pool applied three times in series."""

    def __init__(self, c1, c2, k=5):
        """Create the layers.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Max-pool kernel size.
        """
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden * 4, c2, 1, 1)  # input + three pooled maps
        # stride 1 with padding k // 2 keeps the spatial size for odd k.
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Concatenate the reduced input with three successively pooled maps."""
        pyramid = [self.cv1(x)]
        for _ in range(3):
            pyramid.append(self.m(pyramid[-1]))
        return self.cv2(torch.cat(pyramid, 1))

前向传播流程

整体流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# ultralytics/nn/tasks.py

class DetectionModel(BaseModel):
    """Detection model: runs the parsed layers in order and resolves skip connections."""

    def __init__(self, cfg='yolov8.yaml', ch=3, nc=None, verbose=True):
        """Forward all arguments to ``BaseModel``.

        NOTE(review): ``self.model`` and ``self.save`` used below are
        presumably created by BaseModel from ``cfg`` -- confirm.
        """
        super().__init__(cfg, ch, nc, verbose)

    def forward(self, x, *args, **kwargs):
        """Run one forward pass.

        Training and inference take the same path through the layers, and no
        NMS is applied here. Extra positional and keyword arguments are
        accepted but ignored.
        """
        return self._forward_once(x)

    def _forward_once(self, x):
        """Run every layer once and return the output of the last one."""
        y = []  # per-layer outputs; None for layers nobody reads later
        for m in self.model:
            if m.f != -1:
                # An int picks one earlier output; a list gathers several,
                # where -1 stands for the current tensor.
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]

            x = m(x)
            # Keep only outputs that a later layer needs.
            y.append(x if m.i in self.save else None)

        return x

训练模式 Forward

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 训练模式下返回原始输出
# 每个 Detect 头的输出

def forward_train(x):
    """Sketch of the training-mode data flow (pseudo-code).

    ``backbone``, ``neck`` and ``detect_head`` stand for the three stages of
    the network; they are not defined in this snippet.
    """
    # x: [batch, 3, 640, 640]

    # Backbone outputs:
    #   [batch, 256, 80, 80]
    #   [batch, 512, 40, 40]
    #   [batch, 512, 20, 20]
    backbone_feats = backbone(x)

    # Neck: multi-scale fusion
    fused_feats = neck(backbone_feats)

    # Detect outputs:
    #   [batch, nc+4*reg_max, 80, 80]  (small objects)
    #   [batch, nc+4*reg_max, 40, 40]  (medium objects)
    #   [batch, nc+4*reg_max, 20, 20]  (large objects)
    return detect_head(fused_feats)

推理模式 Forward

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 推理模式下,Detect 包含后处理

class Detect(nn.Module):
    """Detection head with one box branch and one class branch per scale."""

    def __init__(self, nc=80, ch=()):
        """Create the per-scale branches.

        Args:
            nc: Number of classes.
            ch: Channel count of each input feature map; its length is the
                number of detection scales.
        """
        super().__init__()
        self.nc = nc                         # number of classes
        self.nl = len(ch)                    # number of detection scales
        self.reg_max = 16                    # DFL bins per box side
        self.no = nc + self.reg_max * 4      # output channels per location
        self.stride = torch.zeros(self.nl)   # strides, filled in by the model builder

        # Branch widths are derived from the first scale; 0 keeps an empty
        # `ch` (the default) constructible.
        first = ch[0] if ch else 0
        box_width = max(16, first // 4, self.reg_max * 4)
        cls_width = max(first, self.nc)
        # Box branch: 4 * reg_max distribution logits per location.
        self.cv2 = nn.ModuleList(
            nn.Sequential(
                Conv(c, box_width, 3),
                Conv(box_width, box_width, 3),
                nn.Conv2d(box_width, 4 * self.reg_max, 1),
            )
            for c in ch
        )
        # Class branch: nc class logits per location.
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                Conv(c, cls_width, 3),
                Conv(cls_width, cls_width, 3),
                nn.Conv2d(cls_width, self.nc, 1),
            )
            for c in ch
        )

    def forward(self, x):
        """Compute raw predictions for every scale.

        Args:
            x: List of ``nl`` feature maps; it is overwritten in place.

        Returns:
            Training mode: the list of per-scale maps, each with ``no``
            channels. Inference mode: one tensor ``(batch, no, locations)``
            with all scales concatenated along the last axis. The values are
            raw logits in both cases; no box decoding or NMS happens here.
        """
        for i in range(self.nl):
            x[i] = torch.cat([self.cv2[i](x[i]), self.cv3[i](x[i])], 1)

        if self.training:
            return x

        # Flatten each map's spatial dimensions and join the scales.
        return torch.cat([xi.view(xi.shape[0], self.no, -1) for xi in x], 2)

NMS 后处理

NMS 实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# ultralytics/utils/ops.py

def non_max_suppression(
    prediction,
    conf_thres=0.25,
    iou_thres=0.45,
    classes=None,
    agnostic=False,
    multi_label=False,
    max_det=300,
):
    """Filter raw predictions with confidence thresholding and NMS.

    Args:
        prediction: Tensor ``(batch, num_boxes, 5 + nc)`` laid out as
            ``[cx, cy, w, h, conf, class scores...]``.
        conf_thres: Boxes whose confidence is not above this are dropped.
        iou_thres: IoU threshold passed to ``torchvision.ops.nms``.
        classes: Optional list of class ids to keep.
        agnostic: When true, boxes of different classes suppress each other.
        multi_label: Accepted for signature compatibility; this simplified
            version always assigns the single best class to a box.
        max_det: Maximum number of detections kept per image.

    Returns:
        A list with one ``(n, 6)`` tensor per image, each row being
        ``[x1, y1, x2, y2, conf, cls]``.

    NOTE(review): the released YOLOv8 head has no separate confidence
    column, so the upstream function presumably expects
    ``(batch, 4 + nc, num_boxes)`` instead -- confirm before reusing this
    outside the article.
    """
    # Offset that pushes each class into its own coordinate range, so one
    # NMS call never suppresses across classes. It must exceed the largest
    # box coordinate.
    max_wh = 4096

    bs = prediction.shape[0]
    candidates = prediction[..., 4] > conf_thres

    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(bs)]

    for img_idx in range(bs):
        x = prediction[img_idx][candidates[img_idx]]
        if not x.shape[0]:
            continue

        box = xywh2xyxy(x[:, :4])  # centre format -> corner format
        conf = x[:, 4]
        cls = x[:, 5:].argmax(1)

        if classes is not None:
            wanted = (cls[:, None] == torch.tensor(classes, device=x.device)).any(1)
            box, conf, cls = box[wanted], conf[wanted], cls[wanted]
            if not box.shape[0]:
                continue

        offset = cls[:, None].to(box.dtype) * (0 if agnostic else max_wh)
        keep = torchvision.ops.nms(box + offset, conf, iou_thres)[:max_det]

        # Emit the un-offset boxes; the offset exists only for the NMS call.
        output[img_idx] = torch.cat(
            (box[keep], conf[keep, None], cls[keep, None].to(box.dtype)), 1
        )

    return output

坐标转换

1
2
3
4
5
6
7
8
9
10
# ultralytics/utils/ops.py

def xywh2xyxy(x):
    """Convert boxes from centre format to corner format.

    Args:
        x: ``torch.Tensor`` or array whose last axis starts with
            ``[cx, cy, w, h]``; any further entries are copied unchanged.

    Returns:
        A new object of the same kind with ``[x1, y1, x2, y2]`` in the first
        four positions. The input is not modified.
    """
    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    out[..., 0] = x[..., 0] - half_w  # x1
    out[..., 1] = x[..., 1] - half_h  # y1
    out[..., 2] = x[..., 0] + half_w  # x2
    out[..., 3] = x[..., 1] + half_h  # y2
    return out

后处理完整流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def postprocess(preds, img_size, orig_imgs):
    """Turn raw predictions into final detections.

    Args:
        preds: Predictions shaped ``[batch, num_boxes, 4 + 1 + nc]``; the box
            columns are overwritten in place.
        img_size: Size of the network input.
        orig_imgs: Original image(s); only ``.shape`` is read.

    Returns:
        Whatever ``non_max_suppression`` returns for the rescaled predictions.
    """
    # Step 1: map the box columns back to the original image size.
    # NOTE(review): scale_coords is defined elsewhere; its exact contract
    # should be confirmed.
    rescaled = scale_coords(img_size, preds[..., :4], orig_imgs.shape)
    preds[..., :4] = rescaled

    # Step 2: confidence filtering + NMS with the default thresholds.
    return non_max_suppression(
        preds,
        conf_thres=0.25,
        iou_thres=0.45,
        max_det=300,
    )

推理结果解析

结果结构

1
2
3
4
5
6
7
8
9
10
# ultralytics/engine/results.py

class Results:
    """Container for everything predicted on one image."""

    def __init__(self, orig_img, path, names, boxes=None, masks=None, probs=None):
        """Store the inputs, wrapping boxes and masks when they are given.

        Args:
            orig_img: Original image; its ``.shape`` is passed to the wrappers.
            path: Source path of the image.
            names: Mapping from class index to class name.
            boxes: Raw detections, wrapped in ``Boxes`` when not None.
            masks: Raw masks, wrapped in ``Masks`` when not None.
            probs: Classification probabilities, stored as given.
        """
        self.path = path
        self.names = names
        self.orig_img = orig_img
        self.boxes = None if boxes is None else Boxes(boxes, orig_img.shape)
        self.masks = None if masks is None else Masks(masks, orig_img.shape)
        self.probs = probs

Boxes 对象

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
class Boxes:
    """Wrapper around detections stored as rows ``[x1, y1, x2, y2, conf, cls]``."""

    def __init__(self, boxes, orig_shape):
        """Store the detections and the shape of the original image."""
        self.boxes = boxes
        self.orig_shape = orig_shape

    @property
    def xyxy(self):
        """Boxes in corner format ``[x1, y1, x2, y2]``."""
        return self.boxes[:, :4]

    @property
    def xywh(self):
        """Boxes in centre format ``[cx, cy, w, h]``.

        The result is computed on a copy: slicing returns a view, so writing
        into it would overwrite the stored corner coordinates.
        """
        corners = self.boxes[:, :4]
        # torch tensors offer clone(), numpy arrays offer copy().
        out = corners.clone() if hasattr(corners, "clone") else corners.copy()
        out[:, 2] = corners[:, 2] - corners[:, 0]       # w
        out[:, 3] = corners[:, 3] - corners[:, 1]       # h
        out[:, 0] = corners[:, 0] + out[:, 2] / 2       # cx
        out[:, 1] = corners[:, 1] + out[:, 3] / 2       # cy
        return out

    @property
    def conf(self):
        """Confidence of every box."""
        return self.boxes[:, 4]

    @property
    def cls(self):
        """Class index of every box."""
        return self.boxes[:, 5]

使用示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from ultralytics import YOLO

model = YOLO('best.pt')
results = model('test.jpg')

# One result object per input image
for r in results:
    boxes = r.boxes  # detections of this image

    # 1) All boxes at once, as torch tensors
    #    (call .cpu().numpy() when a numpy array is needed)
    print(boxes.xyxy)  # [x1, y1, x2, y2] per row
    print(boxes.conf)  # confidence per box
    print(boxes.cls)   # class index per box

    # 2) A single box by index -- only valid when something was detected
    if len(boxes):
        print(boxes.xyxy[0])

    # 3) Plain Python scalars, one box at a time
    for box in boxes:
        # Iterating yields one-row Boxes objects, so xyxy has shape (1, 4);
        # take row 0 before unpacking.
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        conf = float(box.conf)
        cls = int(box.cls)
        name = r.names[cls]
        print(f"{name}: {conf:.2f} at [{x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f}]")

关键代码片段分析

DFL (Distribution Focal Loss)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# YOLOv8 使用 DFL 将回归转为分类

class DistributionFocalLoss(nn.Module):
    """Distribution Focal Loss: box regression expressed as classification over bins."""

    def __init__(self, reg_max):
        """Store the number of bins per box side."""
        super().__init__()
        self.reg_max = reg_max

    def forward(self, pred_dist, target):
        """Compute the loss.

        Args:
            pred_dist: Bin logits holding ``reg_max`` values per target
                entry, e.g. ``(N, 4, reg_max)``, ``(N * 4, reg_max)`` or
                ``(N, 4 * reg_max)``; reshaped to ``(*target.shape, reg_max)``.
            target: Float distances in bin units, e.g. ``(N, 4)``. The input
                is not modified.

        Returns:
            Loss averaged over the last axis of ``target``, with that axis
            kept at size 1.
        """
        # Keep the target strictly below the last bin so that the right
        # neighbour `left + 1` is always a valid bin index.
        target = target.clamp(0, self.reg_max - 1 - 0.01)

        left = target.long()      # bin just below the target
        right = left + 1          # bin just above the target
        w_left = right - target   # the closer bin receives the larger weight
        w_right = 1 - w_left

        log_prob = torch.log_softmax(
            pred_dist.reshape(*target.shape, self.reg_max), dim=-1
        )
        lp_left = log_prob.gather(-1, left.unsqueeze(-1)).squeeze(-1)
        lp_right = log_prob.gather(-1, right.unsqueeze(-1)).squeeze(-1)

        # loss = -(w_left * log p_left + w_right * log p_right)
        loss = -(lp_left * w_left + lp_right * w_right)
        return loss.mean(-1, keepdim=True)

训练时的损失计算

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# ultralytics/models/yolo/detect/train.py

class DetectionTrainer(Trainer):
    """Outline of the detection loss computation (illustrative, not runnable as shown)."""

    def build_targets(self, targets, batch_size):
        """Build the training targets (placeholder: returns None)."""
        # Intended behaviour per the original note: generate positive samples
        # for every ground-truth box using TAL (Task Alignment Learning).
        pass

    def compute_loss(self, preds, batch):
        """Combine the box, classification and DFL terms into one scalar.

        NOTE(review): ``target`` is never bound in this method and ``batch``
        is unused, so the body raises NameError as written; presumably the
        targets should come from ``build_targets`` -- confirm the intended
        wiring. Reusing ``self.bce`` for the box term also looks like a
        placeholder -- confirm.
        """
        loss = torch.zeros(3, device=self.device)

        # Use the first element when the model returns a tuple.
        if isinstance(preds, tuple):
            pred = preds[0]
        else:
            pred = preds

        loss[0] = self.bce(pred.box, target.bbox)   # box term
        loss[1] = self.bce(pred.cls, target.cls)    # classification term
        loss[2] = self.dfl(pred.dist, target.dist)  # DFL term

        total = loss.sum()
        return total * self.loss_gain

总结

YOLOv8 源码核心要点:

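| 模块 | 位置 | 说明 |
| --- | --- | --- |
| Conv | modules.py | Conv+BN+SiLU |
| C2f | modules.py | 核心特征融合模块 |
| SPPF | modules.py | 空间金字塔池化 |
| Detect | modules.py | 检测头 |
| parse_model | tasks.py | YAML 配置解析 |
| forward | tasks.py | 整体前向 |
| NMS (non_max_suppression) | ops.py | 非极大值抑制 |
| Results | results.py | 结果封装 |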

推理流程:

  1. 输入预处理:Letterbox 缩放 + 归一化
  2. Backbone:特征提取 + 下采样
  3. Neck:多尺度特征融合
  4. Head:分类 + 回归
  5. 后处理:坐标转换 + NMS
  6. 输出:检测框 + 置信度 + 类别