.\YOLO-World\yolo_world\models\detectors\__init__.py
# Copyright notice: all rights reserved by Tencent
# Import the YOLOWorldDetector class from the yolo_world module
from .yolo_world import YOLOWorldDetector
# Export YOLOWorldDetector for external use
__all__ = ['YOLOWorldDetector']
.\YOLO-World\yolo_world\models\layers\yolo_bricks.py
# Copyright notice: all rights reserved by Tencent
from typing import List  # the List type hint
import torch  # the torch library
import torch.nn as nn  # the torch.nn module
from torch import Tensor  # the Tensor type
import torch.nn.functional as F  # functional ops
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Linear  # conv and linear building blocks from mmcv.cnn
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig  # config type aliases from mmdet.utils
from mmengine.model import BaseModule  # BaseModule from mmengine.model
from mmyolo.registry import MODELS  # the MODELS registry from mmyolo
from mmyolo.models.layers import CSPLayerWithTwoConv  # CSPLayerWithTwoConv from mmyolo.models.layers
@MODELS.register_module()  # register the class with the MODELS registry
class MaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block."""

    # Constructor defining the block's hyperparameters
def __init__(self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
kernel_size: int = 3,
padding: int = 1,
num_heads: int = 1,
use_depthwise: bool = False,
with_scale: bool = False,
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
init_cfg: OptMultiConfig = None) -> None:
        # Call the parent class constructor
        super().__init__(init_cfg=init_cfg)
        # Choose depthwise-separable or plain convolution modules
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
        # out_channels and embed_channels must both be divisible by num_heads
        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        # Number of heads and channels per head
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        # If embed_channels differs from in_channels, add a 1x1 conv for the embedding
self.embed_conv = ConvModule(
in_channels,
embed_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None) if embed_channels != in_channels else None
        # Fully-connected layer mapping the guide channels to the embed channels
        self.guide_fc = Linear(guide_channels, embed_channels)
        # Per-head bias parameter
        self.bias = nn.Parameter(torch.zeros(num_heads))
        # Optional learnable per-head scale
        if with_scale:
            self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
        else:
            self.scale = 1.0
        # Convolution projecting in_channels to out_channels
self.project_conv = conv(in_channels,
out_channels,
kernel_size,
stride=1,
padding=padding,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process."""
        # Shape of the input feature map
        B, _, H, W = x.shape
        # Project the guide (text) features through the fully-connected layer
        guide = self.guide_fc(guide)
        # Reshape the guide features into multiple heads
        guide = guide.reshape(B, -1, self.num_heads, self.head_channels)
        # Embed the image features if an embed conv exists, otherwise use x directly
        embed = self.embed_conv(x) if self.embed_conv is not None else x
        # Split the embedding into multiple heads
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)
        # Compute attention weights between image and guide features
        attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
        # Take the maximum over the guide dimension
        attn_weight = attn_weight.max(dim=-1)[0]
        # Normalize by the square root of the head dimension
        attn_weight = attn_weight / (self.head_channels**0.5)
        # Add the per-head bias
        attn_weight = attn_weight + self.bias[None, :, None, None]
        # Apply the sigmoid and multiply by the scale factor
        attn_weight = attn_weight.sigmoid() * self.scale
        # Project the input features
        x = self.project_conv(x)
        # Split the projection into multiple heads
        x = x.reshape(B, self.num_heads, -1, H, W)
        # Weight the projected features with the attention weights
        x = x * attn_weight.unsqueeze(2)
        # Merge the heads back
        x = x.reshape(B, -1, H, W)
        return x
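To make the shapes concrete, here is a minimal smoke test of the block above (a sketch with assumed sizes, not from any shipped YOLO-World config; embed_channels is chosen equal to in_channels so no embed conv is created, and it must equal num_heads * head_channels for the reshapes to hold):

import torch

block = MaxSigmoidAttnBlock(in_channels=64,
                            out_channels=64,
                            guide_channels=512,
                            embed_channels=64,
                            num_heads=4)
x = torch.randn(2, 64, 20, 20)   # B x C x H x W image features
guide = torch.randn(2, 32, 512)  # B x L x guide_channels text features
print(block(x, guide).shape)     # torch.Size([2, 64, 20, 20])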
# Register MaxSigmoidCSPLayerWithTwoConv with the MODELS registry
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers."""

    # Constructor defining the layer's hyperparameters
    def __init__(
            self,
            in_channels: int,  # input channels
            out_channels: int,  # output channels
            guide_channels: int,  # guide (text) channels
            embed_channels: int,  # embedding channels
            num_heads: int = 1,  # number of attention heads, default 1
            expand_ratio: float = 0.5,  # channel expansion ratio, default 0.5
            num_blocks: int = 1,  # number of CSP blocks, default 1
            with_scale: bool = False,  # whether to use a learnable scale, default False
            add_identity: bool = True,  # whether to add an identity shortcut, default True
            conv_cfg: OptConfigType = None,  # convolution config, default None
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),  # normalization config, default BN
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),  # activation config, default SiLU
            init_cfg: OptMultiConfig = None) -> None:  # initialization config, default None
        # Initialize the parent CSP layer
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Final 1x1 conv; its input concatenates (3 + num_blocks) chunks of mid_channels
        self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        # Attention block operating on mid_channels features
self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
self.mid_channels,
guide_channels=guide_channels,
embed_channels=embed_channels,
num_heads=num_heads,
with_scale=with_scale,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
# 定义一个前向传播函数,接受输入张量 x 和引导张量 guide,返回处理后的张量
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
# 使用主要卷积层处理输入张量 x
x_main = self.main_conv(x)
# 将处理后的张量按照通道数分割成两部分
x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
# 对每个分割后的部分依次应用不同的块
x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
# 将最后一个处理后的部分与引导张量 guide 一起传入注意力块
x_main.append(self.attn_block(x_main[-1], guide))
# 将所有处理后的部分拼接在一起,然后通过最终卷积层处理得到最终输出
return self.final_conv(torch.cat(x_main, 1))
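The channel bookkeeping explains the (3 + num_blocks) factor above: the split yields two chunks of mid_channels, each CSP block appends one more, and the attention block appends the last. A short sketch with assumed sizes (mid_channels = 128 * 0.5 = 64, so final_conv sees (3 + 2) * 64 = 320 input channels):

layer = MaxSigmoidCSPLayerWithTwoConv(in_channels=128,
                                      out_channels=128,
                                      guide_channels=512,
                                      embed_channels=64,
                                      num_heads=4,
                                      num_blocks=2)
x = torch.randn(2, 128, 20, 20)
guide = torch.randn(2, 32, 512)
print(layer(x, guide).shape)  # torch.Size([2, 128, 20, 20])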
# Register ImagePoolingAttentionModule with the MODELS registry
@MODELS.register_module()
class ImagePoolingAttentionModule(nn.Module):

    # Constructor
def __init__(self,
image_channels: List[int],
text_channels: int,
embed_channels: int,
with_scale: bool = False,
num_feats: int = 3,
num_heads: int = 8,
pool_size: int = 3):
        # Call the parent constructor
        super().__init__()
        # Store the configuration
        self.text_channels = text_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.num_feats = num_feats
        self.head_channels = embed_channels // num_heads
        self.pool_size = pool_size
        # Optionally add a learnable scale parameter
        if with_scale:
            self.scale = nn.Parameter(torch.tensor([0.]), requires_grad=True)
        else:
            self.scale = 1.0
        # 1x1 projections mapping each image feature level to embed_channels
self.projections = nn.ModuleList([
ConvModule(in_channels, embed_channels, 1, act_cfg=None)
for in_channels in image_channels
])
        # Query, key, value and output projection layers
self.query = nn.Sequential(nn.LayerNorm(text_channels),
Linear(text_channels, embed_channels))
self.key = nn.Sequential(nn.LayerNorm(embed_channels),
Linear(embed_channels, embed_channels))
self.value = nn.Sequential(nn.LayerNorm(embed_channels),
Linear(embed_channels, embed_channels))
self.proj = Linear(embed_channels, text_channels)
        # Adaptive max-pooling layers for the image features
self.image_pools = nn.ModuleList([
nn.AdaptiveMaxPool2d((pool_size, pool_size))
for _ in range(num_feats)
])
    # Forward pass: text features attend over pooled multi-level image features
    def forward(self, text_features, image_features):
        # Batch size
        B = image_features[0].shape[0]
        # The number of feature levels must match num_feats
        assert len(image_features) == self.num_feats
        # Number of pooled patches per feature level
        num_patches = self.pool_size**2
        # Project and pool each image feature level, flattening the patches
        mlvl_image_features = [
            pool(proj(x)).view(B, -1, num_patches)
            for (x, proj, pool
                 ) in zip(image_features, self.projections, self.image_pools)
        ]
        # Concatenate the levels and move channels to the last dimension
        mlvl_image_features = torch.cat(mlvl_image_features,
                                        dim=-1).transpose(1, 2)
        # Queries come from the text features
        q = self.query(text_features)
        # Keys and values come from the pooled image features
        k = self.key(mlvl_image_features)
        v = self.value(mlvl_image_features)
        # Reshape queries, keys and values into multiple heads
        q = q.reshape(B, -1, self.num_heads, self.head_channels)
        k = k.reshape(B, -1, self.num_heads, self.head_channels)
        v = v.reshape(B, -1, self.num_heads, self.head_channels)
        # Compute the attention weights
        attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k)
        # Scale the attention weights
        attn_weight = attn_weight / (self.head_channels**0.5)
        # Normalize with softmax
        attn_weight = F.softmax(attn_weight, dim=-1)
        # Aggregate the values with the attention weights
        x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v)
        # Project back to the text channel dimension
        x = self.proj(x.reshape(B, -1, self.embed_channels))
        # Residual update of the text features, scaled by the scale factor
        return x * self.scale + text_features
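A minimal usage sketch with assumed shapes: with three feature levels and pool_size=3, every level contributes 3 * 3 = 9 pooled patches, so each text token attends over 27 image patches in total and is updated residually:

module = ImagePoolingAttentionModule(image_channels=[64, 128, 256],
                                     text_channels=512,
                                     embed_channels=256,
                                     num_feats=3,
                                     num_heads=8,
                                     pool_size=3)
texts = torch.randn(2, 32, 512)  # B x L x text_channels
imgs = [torch.randn(2, c, s, s)
        for c, s in zip([64, 128, 256], [80, 40, 20])]
print(module(texts, imgs).shape)  # torch.Size([2, 32, 512]), same as the input texts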
# Register VanillaSigmoidBlock, an attention block that uses a plain sigmoid gate
@MODELS.register_module()
class VanillaSigmoidBlock(BaseModule):
"""Sigmoid attention block."""
def __init__(self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
kernel_size: int = 3,
padding: int = 1,
num_heads: int = 1,
use_depthwise: bool = False,
with_scale: bool = False,
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
init_cfg: OptMultiConfig = None) -> None:
super().__init__(init_cfg=init_cfg)
        # Choose depthwise-separable or plain convolution modules
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
        # out_channels and embed_channels must both be divisible by num_heads
        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        # Projection convolution
self.project_conv = conv(in_channels,
out_channels,
kernel_size,
stride=1,
padding=padding,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process."""
        # Project the input features
        x = self.project_conv(x)
        # Gate the features with their own sigmoid activation
        x = x * x.sigmoid()
        return x
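Note that the guide argument exists only for interface compatibility with MaxSigmoidAttnBlock; the text features are never used, and the block simply gates its own projection. A quick sketch with assumed sizes:

block = VanillaSigmoidBlock(in_channels=64,
                            out_channels=64,
                            guide_channels=512,
                            embed_channels=64)
x = torch.randn(2, 64, 20, 20)
out = block(x, torch.randn(2, 32, 512))  # the guide tensor is ignored
print(out.shape)                         # torch.Size([2, 64, 20, 20])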
# Register EfficientCSPLayerWithTwoConv, a CSP layer whose attention uses the plain sigmoid gate
@MODELS.register_module()
class EfficientCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers."""

    # Constructor defining the layer's hyperparameters
    def __init__(
            self,
            in_channels: int,  # input channels
            out_channels: int,  # output channels
            guide_channels: int,  # guide (text) channels
            embed_channels: int,  # embedding channels
            num_heads: int = 1,  # number of attention heads, default 1
            expand_ratio: float = 0.5,  # channel expansion ratio, default 0.5
            num_blocks: int = 1,  # number of CSP blocks, default 1
            with_scale: bool = False,  # whether to use a learnable scale, default False
            add_identity: bool = True,  # whether to add an identity shortcut, default True
            conv_cfg: OptConfigType = None,  # convolution config, default None
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),  # normalization config, default BN
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),  # activation config, default SiLU
            init_cfg: OptMultiConfig = None) -> None:  # initialization config, default None
        # Initialize the parent CSP layer
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Final 1x1 conv over (3 + num_blocks) chunks of mid_channels
        self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        # Attention block
self.attn_block = VanillaSigmoidBlock(self.mid_channels,
self.mid_channels,
guide_channels=guide_channels,
embed_channels=embed_channels,
num_heads=num_heads,
with_scale=with_scale,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process."""
        # Process the input through the main convolution
        x_main = self.main_conv(x)
        # Split the result into two chunks along the channel dimension
        x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
        # Apply each CSP block to the latest chunk, appending the outputs
        x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
        # Run the attention block on the last chunk
        x_main.append(self.attn_block(x_main[-1], guide))
        # Concatenate all chunks and fuse them through the final convolution
        return self.final_conv(torch.cat(x_main, 1))
.\YOLO-World\yolo_world\models\layers\__init__.py
# Copyright notice: all rights reserved by Tencent
# Basic brick modules of the PAFPN, based on CSPLayers
# Import the relevant classes from the yolo_bricks module
from .yolo_bricks import (
CSPLayerWithTwoConv,
MaxSigmoidAttnBlock,
MaxSigmoidCSPLayerWithTwoConv,
ImagePoolingAttentionModule,
)
# Names exported for external use
__all__ = ['CSPLayerWithTwoConv',
'MaxSigmoidAttnBlock',
'MaxSigmoidCSPLayerWithTwoConv',
'ImagePoolingAttentionModule']
.\YOLO-World\yolo_world\models\losses\dynamic_loss.py
# Import the necessary libraries
from typing import Optional
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.models.losses.mse_loss import mse_loss
from mmyolo.registry import MODELS
# Register the CoVMSELoss class with the MODELS registry
@MODELS.register_module()
class CoVMSELoss(nn.Module):
def __init__(self,
dim: int = 0,
reduction: str = 'mean',
loss_weight: float = 1.0,
eps: float = 1e-6) -> None:
super().__init__()
        # Store the configuration
self.dim = dim
self.reduction = reduction
self.loss_weight = loss_weight
self.eps = eps
def forward(self,
pred: Tensor,
weight: Optional[Tensor] = None,
avg_factor: Optional[int] = None,
reduction_override: Optional[str] = None) -> Tensor:
"""Forward function of loss."""
        # The reduction override must be one of the allowed values
        assert reduction_override in (None, 'none', 'mean', 'sum')
        # Use the override if given, otherwise the default reduction
        reduction = (
            reduction_override if reduction_override else self.reduction)
        # Coefficient of variation: std / mean along the chosen dimension
        cov = pred.std(self.dim) / pred.mean(self.dim).clamp(min=self.eps)
        # The target is zero, i.e. a perfectly uniform prediction
        target = torch.zeros_like(cov)
        # MSE between the CoV and zero, weighted by loss_weight
        loss = self.loss_weight * mse_loss(
            cov, target, weight, reduction=reduction, avg_factor=avg_factor)
        return loss
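A quick numeric check with made-up values: the loss drives the coefficient of variation std/mean along dim toward zero, so a constant row contributes nothing while a varying one is penalized:

loss_fn = CoVMSELoss(dim=1)
pred = torch.tensor([[1.0, 1.0, 1.0],   # CoV = 0 / 1 = 0
                     [1.0, 2.0, 3.0]])  # CoV = 1 / 2 = 0.5
print(loss_fn(pred))  # tensor(0.1250), i.e. the mean of (0**2, 0.5**2)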
.\YOLO-World\yolo_world\models\losses\__init__.py
# Copyright notice: all rights reserved by Tencent
# Import the CoVMSELoss class from the dynamic loss module
from .dynamic_loss import CoVMSELoss
# Export CoVMSELoss for external use
__all__ = ['CoVMSELoss']
.\YOLO-World\yolo_world\models\necks\yolo_world_pafpn.py
# Import the necessary libraries
import copy
from typing import List, Union
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.utils import ConfigType, OptMultiConfig
# Import the model registry and helper functions
from mmyolo.registry import MODELS
from mmyolo.models.utils import make_divisible, make_round
from mmyolo.models.necks.yolov8_pafpn import YOLOv8PAFPN
# Register YOLOWorldPAFPN with the MODELS registry
@MODELS.register_module()
class YOLOWorldPAFPN(YOLOv8PAFPN):
"""Path Aggregation Network used in YOLO World
Following YOLOv8 PAFPN, including text to image fusion
"""
    # Constructor defining the network structure and parameters
def __init__(self,
in_channels: List[int],
out_channels: Union[List[int], int],
guide_channels: int,
embed_channels: List[int],
num_heads: List[int],
deepen_factor: float = 1.0,
widen_factor: float = 1.0,
num_csp_blocks: int = 3,
freeze_all: bool = False,
block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None) -> None:
        # Store the guide channels, embed channels, head counts and block config
        self.guide_channels = guide_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.block_cfg = block_cfg
        # Initialize the parent YOLOv8 PAFPN with the shared arguments
super().__init__(in_channels=in_channels,
out_channels=out_channels,
deepen_factor=deepen_factor,
widen_factor=widen_factor,
num_csp_blocks=num_csp_blocks,
freeze_all=freeze_all,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
    # Build a top-down fusion layer
def build_top_down_layer(self, idx: int) -> nn.Module:
"""build top down layer.
Args:
idx (int): layer idx.
Returns:
nn.Module: The top down layer.
"""
        # Deep-copy the block config
        block_cfg = copy.deepcopy(self.block_cfg)
        # Update the block config for this layer
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.in_channels[idx - 1] + self.in_channels[idx]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx - 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx - 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx - 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        # Build the block from the registry
        return MODELS.build(block_cfg)
    # Build a bottom-up fusion layer
def build_bottom_up_layer(self, idx: int) -> nn.Module:
"""build bottom up layer.
Args:
idx (int): layer idx.
Returns:
nn.Module: The bottom up layer.
"""
        # Deep-copy the block config
        block_cfg = copy.deepcopy(self.block_cfg)
        # Update the block config for this layer
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.out_channels[idx] + self.out_channels[idx + 1]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx + 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx + 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx + 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        # Build the block from the registry
        return MODELS.build(block_cfg)
    # Forward pass: takes multi-level image features and text features, returns a tuple
    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function.

        Takes multi-level image features and text features of shape B x L x D.
        """
        # The number of image feature levels must match in_channels
        assert len(img_feats) == len(self.in_channels)
        # Reduce layers
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))
        # Top-down path
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)
        # Bottom-up path
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat([downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)
        # Output layers
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
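A hypothetical build-and-run sketch (channel numbers are illustrative, not taken from a shipped config): the neck consumes three feature levels plus B x L x D text features and returns three fused levels:

neck = MODELS.build(
    dict(type='YOLOWorldPAFPN',
         in_channels=[256, 512, 1024],
         out_channels=[256, 512, 1024],
         guide_channels=512,
         embed_channels=[128, 256, 512],
         num_heads=[4, 8, 16],
         block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')))
img_feats = [torch.randn(2, c, s, s)
             for c, s in zip([256, 512, 1024], [80, 40, 20])]
txt_feats = torch.randn(2, 32, 512)
p3, p4, p5 = neck(img_feats, txt_feats)  # (2, 256, 80, 80), (2, 512, 40, 40), (2, 1024, 20, 20)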
# Register YOLOWorldDualPAFPN with the MODELS registry
@MODELS.register_module()
class YOLOWorldDualPAFPN(YOLOWorldPAFPN):
    """Path Aggregation Network used in YOLO World v8."""

    # Constructor
    def __init__(self,
                 in_channels: List[int],  # list of input channels
                 out_channels: Union[List[int], int],  # output channels (list or int)
                 guide_channels: int,  # guide (text) channels
                 embed_channels: List[int],  # list of embedding channels
                 num_heads: List[int],  # list of attention head counts
                 deepen_factor: float = 1.0,  # depth multiplier, default 1.0
                 widen_factor: float = 1.0,  # width multiplier, default 1.0
                 num_csp_blocks: int = 3,  # number of CSP blocks, default 3
                 freeze_all: bool = False,  # whether to freeze all layers, default False
                 text_enhancder: ConfigType = dict(  # text enhancer config (the parameter name is misspelled in the source)
                     type='ImagePoolingAttentionModule',  # an image-pooling attention module
                     embed_channels=256,  # 256 embedding channels
                     num_heads=8,  # 8 attention heads
                     pool_size=3),  # pooling size of 3
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),  # block config, default CSPLayerWithTwoConv
                 norm_cfg: ConfigType = dict(type='BN',  # normalization config, default BN
                                             momentum=0.03,  # momentum of 0.03
                                             eps=0.001),  # epsilon of 0.001
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),  # activation config, default SiLU
                 init_cfg: OptMultiConfig = None) -> None:  # initialization config, default None
        # Initialize the parent YOLOWorldPAFPN
super().__init__(in_channels=in_channels,
out_channels=out_channels,
guide_channels=guide_channels,
embed_channels=embed_channels,
num_heads=num_heads,
deepen_factor=deepen_factor,
widen_factor=widen_factor,
num_csp_blocks=num_csp_blocks,
freeze_all=freeze_all,
block_cfg=block_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
        # Update the text enhancer config with channel and level information
        text_enhancder.update(
            dict(
                image_channels=[int(x * widen_factor) for x in out_channels],  # image channels scaled by widen_factor
                text_channels=guide_channels,  # text channels equal the guide channels
                num_feats=len(out_channels),  # one pooled feature per output level
            ))
        # Print the text enhancer config
        print(text_enhancder)
        # Build the text enhancer from the registry
        self.text_enhancer = MODELS.build(text_enhancder)
    # Forward pass: takes a list of image features and text features, returns a tuple
    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function."""
        # The number of image feature levels must match in_channels
        assert len(img_feats) == len(self.in_channels)
        # Reduce layers
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))
        # Top-down path
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)
        # Enhance the text features with the pooled image features
        txt_feats = self.text_enhancer(txt_feats, inner_outs)
        # Bottom-up path
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat([downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)
        # Output layers
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
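The dual neck is configured the same way as YOLOWorldPAFPN; the extra text_enhancder entry configures the image-pooling attention, and __init__ fills in image_channels, text_channels and num_feats automatically. A hedged sketch with illustrative values:

neck = MODELS.build(
    dict(type='YOLOWorldDualPAFPN',
         in_channels=[256, 512, 1024],
         out_channels=[256, 512, 1024],
         guide_channels=512,
         embed_channels=[128, 256, 512],
         num_heads=[4, 8, 16],
         text_enhancder=dict(type='ImagePoolingAttentionModule',
                             embed_channels=256,
                             num_heads=8,
                             pool_size=3)))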
.\YOLO-World\yolo_world\models\necks\__init__.py
# Copyright notice: all rights reserved by Tencent
# Import YOLOWorldPAFPN and YOLOWorldDualPAFPN from the yolo_world_pafpn module
from .yolo_world_pafpn import YOLOWorldPAFPN, YOLOWorldDualPAFPN
# __all__ lists the names importable from this module
__all__ = ['YOLOWorldPAFPN', 'YOLOWorldDualPAFPN']
.\YOLO-World\yolo_world\models\__init__.py
# Copyright notice: all rights reserved by Tencent
# Import everything from the backbones module
from .backbones import *  # noqa
# Import everything from the layers module
from .layers import *  # noqa
# Import everything from the detectors module
from .detectors import *  # noqa
# Import everything from the losses module
from .losses import *  # noqa
# Import everything from the data_preprocessors module
from .data_preprocessors import *  # noqa
# Import everything from the dense_heads module
from .dense_heads import *  # noqa
# Import everything from the necks module
from .necks import *  # noqa
.\YOLO-World\yolo_world\version.py
# Copyright notice: all rights reserved by Tencent
# The package version number
__version__ = '0.1.0'
# Function that parses a version string into an info tuple
def parse_version_info(version_str):
"""Parse a version string into a tuple.
Args:
version_str (str): The version string.
Returns:
tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
(1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
"""
    # Accumulate the parsed components
    version_info = []
    # Split the version string on '.'
    for x in version_str.split('.'):
        # Numeric components become integers
        if x.isdigit():
            version_info.append(int(x))
        # Components containing 'rc' are split into patch and release-candidate parts
        elif x.find('rc') != -1:
            patch_version = x.split('rc')
            version_info.append(int(patch_version[0]))
            version_info.append(f'rc{patch_version[1]}')
    # Return the version info as a tuple
    return tuple(version_info)
# Parse the version string into the version info tuple
version_info = parse_version_info(__version__)
# Exported names
__all__ = ['__version__', 'version_info', 'parse_version_info']
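Example of the parser's behavior:

print(parse_version_info('0.1.0'))     # (0, 1, 0)
print(parse_version_info('2.0.0rc1'))  # (2, 0, 0, 'rc1')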
.\YOLO-World\yolo_world\__init__.py
# Import everything from the local models module
from .models import *  # noqa
# Import everything from the local datasets module
from .datasets import *  # noqa
# Import everything from the local engine module
from .engine import *  # noqa