Single Shot Detection (SSD) - Lite Version
Last updated on: a year ago
SSDLite is widely used to demonstrate a new backbone’s detection capability (see, e.g., MobileViT [2]). It was first introduced in the MobileNetV2 paper [1].
SSDLite
All regular convolutions are replaced with separable convolutions (depthwise followed by $1\times 1$ projection). The first layer of SSDLite is attached to layer 15 (with an output stride of 16). The rest of the SSDLite layers are connected to the last layer (with an output stride of 32).
Code
MobileNetV2 forward pass (the backbone acts as a feature extractor):
## MobileNetV2
def forward(self, x):
    """Run the backbone and collect the feature maps handed to the detection head.

    One map is taken after the first 14 entries of ``self.features``, one
    after the remaining entries of ``self.features``, and one after every
    entry of ``self.extras``.

    Args:
        x: Input passed to ``self.features[0]``.

    Returns:
        A tuple of feature maps, ordered from the earliest tap to the last
        extra layer.
    """
    # Index of the first layer belonging to the second backbone stage.
    # NOTE(review): presumably the split between the stride-16 and stride-32
    # parts of the network described in the accompanying text - confirm.
    first_stage_end = 14

    for layer_idx in range(first_stage_end):
        x = self.features[layer_idx](x)
    feature_maps = [x]

    for layer_idx in range(first_stage_end, len(self.features)):
        x = self.features[layer_idx](x)
    feature_maps.append(x)

    # Every extra layer contributes its own output.
    for layer_idx in range(len(self.extras)):
        x = self.extras[layer_idx](x)
        feature_maps.append(x)

    return tuple(feature_maps)
Detector framework: (construct the detection architecture)
class SSDDetector(nn.Module):
    """Detector that chains a backbone with a box head, both built from ``cfg``."""

    def __init__(self, cfg):
        """Build the backbone and the box head from the configuration object."""
        super().__init__()
        self.cfg = cfg
        self.backbone = build_backbone(cfg)
        self.box_head = build_box_head(cfg)

    def forward(self, images, targets=None):
        """Run the backbone and the box head on ``images``.

        Args:
            images: Input forwarded to the backbone.
            targets: Optional targets forwarded unchanged to the box head.

        Returns:
            The box head's losses while the module is in training mode,
            otherwise the box head's detections.
        """
        head_output = self.box_head(self.backbone(images), targets)
        detections, detector_losses = head_output
        return detector_losses if self.training else detections
SSDBoxHead:
@registry.BOX_HEADS.register('SSDBoxHead')
class SSDBoxHead(nn.Module):
    """Box head: predicts class logits and box regressions, then either
    computes losses (training) or post-processes detections (evaluation)."""

    def __init__(self, cfg):
        """Create the predictor, the loss evaluator and the post-processor."""
        super().__init__()
        self.cfg = cfg
        self.predictor = make_box_predictor(cfg)

        # MultiBoxLoss is the default; FocalLoss is used only on request.
        # NOTE(review): 0.25 and 2 are presumably focal-loss alpha and gamma - confirm.
        wants_focal = self.cfg.MODEL.BOX_HEAD.LOSS == 'FocalLoss'
        self.loss_evaluator = (
            FocalLoss(0.25, 2)
            if wants_focal
            else MultiBoxLoss(neg_pos_ratio=cfg.MODEL.NEG_POS_RATIO)
        )

        self.post_processor = PostProcessor(cfg)
        # Prior boxes are created lazily on the first evaluation pass.
        self.priors = None

    def forward(self, features, targets=None):
        """Predict from ``features`` and dispatch on the training flag.

        Returns:
            A ``(detections, loss_dict)`` pair; ``loss_dict`` is empty in
            evaluation mode.
        """
        cls_logits, bbox_pred = self.predictor(features)
        if not self.training:
            return self._forward_test(cls_logits, bbox_pred)
        return self._forward_train(cls_logits, bbox_pred, targets)

    def _forward_train(self, cls_logits, bbox_pred, targets):
        """Compute regression and classification losses against ``targets``.

        ``targets`` must provide the keys ``'boxes'`` and ``'labels'``.
        """
        gt_boxes = targets['boxes']
        gt_labels = targets['labels']
        reg_loss, cls_loss = self.loss_evaluator(cls_logits, bbox_pred, gt_labels, gt_boxes)
        losses = {'reg_loss': reg_loss, 'cls_loss': cls_loss}
        # The raw predictions are returned as the "detections" during training.
        return (cls_logits, bbox_pred), losses

    def _forward_test(self, cls_logits, bbox_pred):
        """Turn raw predictions into post-processed detections."""
        if self.priors is None:
            # Built once and kept on the device of the predictions.
            self.priors = PriorBox(self.cfg)().to(bbox_pred.device)

        model_cfg = self.cfg.MODEL
        # The score activation follows the configured loss:
        # sigmoid for FocalLoss, softmax over dim 2 otherwise.
        if model_cfg.BOX_HEAD.LOSS == 'FocalLoss':
            scores = cls_logits.sigmoid()
        else:
            scores = F.softmax(cls_logits, dim=2)

        center_boxes = box_utils.convert_locations_to_boxes(
            bbox_pred, self.priors, model_cfg.CENTER_VARIANCE, model_cfg.SIZE_VARIANCE
        )
        corner_boxes = box_utils.center_form_to_corner_form(center_boxes)
        return self.post_processor((scores, corner_boxes)), {}
Depthwise separable convolution:
class SeparableConv2d(nn.Module):
    """Depthwise convolution, batch norm and activation, followed by a 1x1 projection.

    Args:
        in_channels: Channels of the input; also the depthwise group count.
        out_channels: Channels produced by the final 1x1 convolution.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        padding: Padding of the depthwise convolution.
        onnx_compatible: Use ``nn.ReLU`` instead of ``nn.ReLU6`` when true.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, onnx_compatible=False):
        super().__init__()
        if onnx_compatible:
            activation_cls = nn.ReLU
        else:
            activation_cls = nn.ReLU6

        # Layers are created in execution order so parameter initialisation
        # draws from the RNG in the same sequence as they run.
        depthwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            groups=in_channels,  # one filter group per input channel
            stride=stride,
            padding=padding,
        )
        norm = nn.BatchNorm2d(in_channels)
        activation = activation_cls()
        pointwise = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)

        self.conv = nn.Sequential(depthwise, norm, activation, pointwise)

    def forward(self, x):
        """Apply the separable convolution stack to ``x``."""
        return self.conv(x)
BoxPredictor: (The box predictor makes use of depthwise separable convolution.)
class BoxPredictor(nn.Module):
    """Base class for per-level classification and box-regression heads.

    Subclasses implement ``cls_block`` and ``reg_block`` to build the head
    for one feature level. One pair of heads is created for every
    ``(boxes_per_location, out_channels)`` pair taken from
    ``cfg.MODEL.PRIORS.BOXES_PER_LOCATION`` and
    ``cfg.MODEL.BACKBONE.OUT_CHANNELS``.
    """

    def __init__(self, cfg):
        super().__init__()
        # Stored first: subclass block builders read self.cfg.
        self.cfg = cfg
        self.cls_headers = nn.ModuleList()
        self.reg_headers = nn.ModuleList()
        level_specs = zip(cfg.MODEL.PRIORS.BOXES_PER_LOCATION, cfg.MODEL.BACKBONE.OUT_CHANNELS)
        for level, (boxes_per_location, out_channels) in enumerate(level_specs):
            self.cls_headers.append(self.cls_block(level, out_channels, boxes_per_location))
            self.reg_headers.append(self.reg_block(level, out_channels, boxes_per_location))
        self.reset_parameters()
        # With focal loss, every convolution in the classification heads is
        # re-initialised with the prior-probability bias.
        if self.cfg.MODEL.BOX_HEAD.LOSS == 'FocalLoss':
            for cls_head in self.cls_headers:
                for m in cls_head.modules():
                    if isinstance(m, nn.Conv2d):
                        self.initialize_prior(m)

    def cls_block(self, level, out_channels, boxes_per_location):
        """Build the classification head for feature level ``level``."""
        raise NotImplementedError

    def reg_block(self, level, out_channels, boxes_per_location):
        """Build the box-regression head for feature level ``level``."""
        raise NotImplementedError

    def reset_parameters(self):
        """Xavier-initialise every conv weight and zero every conv bias."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight)
                # Convolutions built with bias=False have no bias to reset.
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def initialize_prior(self, layer):
        """Initialise ``layer`` so its sigmoid output starts at probability ``pi``.

        The bias is set to ``-log((1 - pi) / pi)`` with ``pi = 0.01`` and the
        weight is drawn from a normal distribution with ``std=0.01``.
        """
        pi = 0.01
        b = - math.log((1 - pi) / pi)
        # Skip the bias for convolutions built with bias=False.
        if layer.bias is not None:
            nn.init.constant_(layer.bias, b)
        nn.init.normal_(layer.weight, std=0.01)

    def forward(self, features):
        """Apply the heads level by level and concatenate the predictions.

        Args:
            features: Sequence of feature maps, one per head, each indexed as
                ``(batch, channels, height, width)`` by the permute below.

        Returns:
            ``(cls_logits, bbox_pred)`` with shapes
            ``(batch, -1, cfg.MODEL.NUM_CLASSES)`` and ``(batch, -1, 4)``.
        """
        cls_logits = []
        bbox_pred = []
        for feature, cls_header, reg_header in zip(features, self.cls_headers, self.reg_headers):
            # Channels-last, so flattening keeps each location's predictions contiguous.
            cls_logits.append(cls_header(feature).permute(0, 2, 3, 1).contiguous())
            bbox_pred.append(reg_header(feature).permute(0, 2, 3, 1).contiguous())

        batch_size = features[0].shape[0]
        cls_logits = torch.cat(
            [logits.view(logits.shape[0], -1) for logits in cls_logits], dim=1
        ).view(batch_size, -1, self.cfg.MODEL.NUM_CLASSES)
        bbox_pred = torch.cat(
            [locs.view(locs.shape[0], -1) for locs in bbox_pred], dim=1
        ).view(batch_size, -1, 4)

        return cls_logits, bbox_pred
@registry.BOX_PREDICTORS.register('SSDBoxPredictor')
class SSDBoxPredictor(BoxPredictor):
    """Box predictor whose heads are plain 3x3 convolutions."""

    def cls_block(self, level, out_channels, boxes_per_location):
        """Return a 3x3 conv emitting ``NUM_CLASSES`` logits per box per location."""
        num_outputs = boxes_per_location * self.cfg.MODEL.NUM_CLASSES
        return nn.Conv2d(out_channels, num_outputs, kernel_size=3, stride=1, padding=1)

    def reg_block(self, level, out_channels, boxes_per_location):
        """Return a 3x3 conv emitting 4 regression values per box per location."""
        num_outputs = boxes_per_location * 4
        return nn.Conv2d(out_channels, num_outputs, kernel_size=3, stride=1, padding=1)
@registry.BOX_PREDICTORS.register('SSDLiteBoxPredictor')
class SSDLiteBoxPredictor(BoxPredictor):
    """Box predictor using separable 3x3 heads, with a 1x1 conv on the last level."""

    def _make_head(self, level, in_channels, num_outputs):
        """Build the head for ``level`` producing ``num_outputs`` channels.

        The last feature level gets a plain 1x1 convolution; every other
        level gets a 3x3 ``SeparableConv2d``.
        """
        last_level = len(self.cfg.MODEL.BACKBONE.OUT_CHANNELS) - 1
        if level != last_level:
            return SeparableConv2d(in_channels, num_outputs, kernel_size=3, stride=1, padding=1)
        return nn.Conv2d(in_channels, num_outputs, kernel_size=1)

    def cls_block(self, level, out_channels, boxes_per_location):
        """Return the classification head (``NUM_CLASSES`` logits per box)."""
        return self._make_head(level, out_channels, boxes_per_location * self.cfg.MODEL.NUM_CLASSES)

    def reg_block(self, level, out_channels, boxes_per_location):
        """Return the regression head (4 values per box)."""
        return self._make_head(level, out_channels, boxes_per_location * 4)
References
[1] Sandler, M., Howard, A., Zhu, M., Zhmoginov, A. and Chen, L.C., 2018. Mobilenetv2: Inverted residuals and linear bottlenecks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4510-4520).
[2] Mehta, S. and Rastegari, M., 2021. Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer. arXiv preprint arXiv:2110.02178.
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!