Luca Scotton
8 min readJun 25, 2019

--

I fixed several errors in the code and slightly renamed some variables to make the code easier to understand and debug. In addition, I added the snippet for the last regression loss, which was not provided by the original author.

import numpy as np
import torch
import torch.nn as nn
import torchvision

# %%
# Dummy 800x800 RGB input and two ground-truth boxes used by the whole walkthrough.
image = torch.zeros((1, 3, 800, 800)).float()

# Boxes are stored in [y1, x1, y2, x2] format.
bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]])
# Class id of each box; 0 represents background.
labels = torch.LongTensor([6, 8])
sub_sample = 16
# %%
model = torchvision.models.vgg16(pretrained=True)
fe = list(model.features)
# %%
# Walk through the VGG16 feature layers and keep every layer that comes before
# the one that shrinks the map height below 800 // 16.
req_features = []
k = image.clone()
out_channels = None
min_map_size = 800 // 16
for layer in fe:
    k = layer(k)
    if k.shape[2] < min_map_size:
        break
    req_features.append(layer)
    out_channels = k.shape[1]

print(len(req_features))  # 30
print(out_channels)  # 512

# The kept layers form the backbone; run it once to get the feature map.
faster_rcnn_fe_extractor = nn.Sequential(*req_features)
out_map = faster_rcnn_fe_extractor(image)
print(out_map.size())

# %%
# Anchor shapes: every (ratio, scale) pair produces one anchor per location.
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

# One [y1, x1, y2, x2] row per (ratio, scale) pair, initially all zeros.
anchor_base = np.zeros(
    shape=(len(ratios) * len(anchor_scales), 4),
    dtype=np.float32,
)

print(anchor_base)

# %%
# Sanity check: build the anchor boxes around one hand-picked centre point.
center_y = 50.
center_x = 120.

print(center_y, center_x)
# Out: 50.0 120.0

for i, ratio in enumerate(ratios):
    print(f'ratio {ratios[i]}:')
    for j, scale in enumerate(anchor_scales):
        # Height grows and width shrinks with sqrt(ratio).
        h = sub_sample * scale * np.sqrt(ratio)
        w = sub_sample * scale * np.sqrt(1. / ratio)

        index = i * len(anchor_scales) + j

        # Row layout is [y1, x1, y2, x2].
        anchor_base[index] = (
            center_y - h / 2.,
            center_x - w / 2.,
            center_y + h / 2.,
            center_x + w / 2.,
        )
        print(f'\tscale {anchor_scales[j]}: {anchor_base[index]}')
# %%
# Enumerate every anchor over the whole feature map.
fe_size = 800 // sub_sample
center_x = np.arange(sub_sample, (fe_size + 1) * sub_sample, sub_sample)
center_y = np.arange(sub_sample, (fe_size + 1) * sub_sample, sub_sample)

centers = np.zeros((len(center_y) * len(center_x), 2))

# Centres are listed row by row (y outer, x inner). The RPN outputs are permuted
# to (N, H, W, C) and flattened in the same row-major order, so anchor k must
# describe the same location as prediction k.
index = 0
for y in range(len(center_y)):
    for x in range(len(center_x)):
        # Shift from the cell's bottom-right corner to its centre.
        centers[index, 0] = center_y[y] - sub_sample // 2
        centers[index, 1] = center_x[x] - sub_sample // 2
        index += 1

# One anchor per (centre, ratio, scale) triple, each stored as [y1, x1, y2, x2].
anchors = np.zeros((len(centers) * len(ratios) * len(anchor_scales), 4), dtype=np.float32)

index = 0
for c in centers:
    ctr_y, ctr_x = c
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])

            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1

print(anchors.shape)
print(anchors)

# %%
# Keep only the anchors that lie completely inside the 800x800 image.
is_inside = (
    (anchors[:, 0] >= 0)
    & (anchors[:, 1] >= 0)
    & (anchors[:, 2] <= 800)
    & (anchors[:, 3] <= 800)
)
inside_indexes = np.flatnonzero(is_inside)
print(inside_indexes.shape)

# Every kept anchor starts with label -1 until a label is assigned.
bbox_labels = np.full((len(inside_indexes),), -1, dtype=np.int32)
print(bbox_labels.shape)

valid_anchor_boxes = anchors[inside_indexes]
print(valid_anchor_boxes.shape)
# %%
# IoU between every valid anchor (rows) and every ground-truth box (columns).
# Computed with broadcasting so it works for any number of ground-truth boxes.
gt_boxes = bbox.cpu().numpy()

# Top-left and bottom-right corners of each (anchor, gt box) intersection.
inter_tl = np.maximum(valid_anchor_boxes[:, np.newaxis, :2], gt_boxes[np.newaxis, :, :2])
inter_br = np.minimum(valid_anchor_boxes[:, np.newaxis, 2:], gt_boxes[np.newaxis, :, 2:])
inter_hw = inter_br - inter_tl

# A pair overlaps only if the intersection is strictly positive on both axes;
# every other pair gets an intersection area (and therefore an IoU) of 0.
overlaps = (inter_hw > 0).all(axis=2)
inter_area = inter_hw.prod(axis=2) * overlaps

anchor_area = (valid_anchor_boxes[:, 2:] - valid_anchor_boxes[:, :2]).prod(axis=1)
box_area = (gt_boxes[:, 2:] - gt_boxes[:, :2]).prod(axis=1)

union_area = anchor_area[:, np.newaxis] + box_area[np.newaxis, :] - inter_area
ious = (inter_area / union_area).astype(np.float32)
print(ious.shape)

# %%
# case 1: for each ground-truth box, the anchor with the highest IoU
gt_argmax_ious = np.argmax(ious, axis=0)
print(gt_argmax_ious)

gt_max_ious = np.max(ious, axis=0)
print(gt_max_ious)

# %%
# case 2: for each anchor, the ground-truth box with the highest IoU

argmax_ious = np.argmax(ious, axis=1)
print(argmax_ious.shape)
print(argmax_ious)

max_ious = np.max(ious, axis=1)
print(max_ious)

# Every anchor that reaches the best IoU of some ground-truth box (ties included).
gt_argmax_ious = np.nonzero(ious == gt_max_ious)[0]
print(gt_argmax_ious)

# %%
# Assign anchor labels: 1 = object, 0 = background, -1 = left out.
pos_iou_threshold = 0.7
neg_iou_threshold = 0.3

bbox_labels[max_ious < neg_iou_threshold] = 0
# The best anchor(s) for each ground-truth box are positive whatever their IoU.
bbox_labels[gt_argmax_ious] = 1
bbox_labels[max_ious >= pos_iou_threshold] = 1

pos_ratio = 0.5
n_sample = 256
# Must be an int: it sizes the random draw below, and numpy rejects a float size.
n_pos = int(pos_ratio * n_sample)

pos_index = np.where(bbox_labels == 1)[0]

# Too many positives: reset a random surplus back to -1.
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=len(pos_index) - n_pos, replace=False)
    bbox_labels[disable_index] = -1
# %%
# Negatives fill whatever remains of the n_sample budget.
n_neg = n_sample - np.sum(bbox_labels == 1)
neg_index = np.where(bbox_labels == 0)[0]

if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=len(neg_index) - n_neg, replace=False)
    bbox_labels[disable_index] = -1

# %%
# Regression targets (dy, dx, dh, dw) of each valid anchor towards the
# ground-truth box it overlaps most.
max_iou_bbox = bbox[argmax_ious]
print(max_iou_bbox)

height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width

# Convert the matched boxes to numpy once, then do the same arithmetic there.
max_iou_bbox_np = max_iou_bbox.cpu().numpy()
base_height = max_iou_bbox_np[:, 2] - max_iou_bbox_np[:, 0]
base_width = max_iou_bbox_np[:, 3] - max_iou_bbox_np[:, 1]
base_ctr_y = max_iou_bbox_np[:, 0] + 0.5 * base_height
base_ctr_x = max_iou_bbox_np[:, 1] + 0.5 * base_width

# Floor the anchor sizes at machine epsilon so the divisions below are safe.
eps = np.finfo(height.dtype).eps
height = np.clip(height, eps, None)
width = np.clip(width, eps, None)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

anchor_locs = np.stack((dy, dx, dh, dw), axis=1)
print(anchor_locs)

# %%
# The network predicts for every anchor, so scatter the targets of the valid
# anchors back into arrays that cover the full anchor set.
anchor_labels = np.full((len(anchors),), -1, dtype=bbox_labels.dtype)
anchor_labels[inside_indexes] = bbox_labels

anchor_locations = np.zeros(anchors.shape, dtype=anchor_locs.dtype)
anchor_locations[inside_indexes, :] = anchor_locs

# %%
import torch.nn as nn

mid_channels = 512
in_channels = 512  # depends on the output feature map. in vgg 16 it is equal to 512
n_anchor = len(ratios) * len(anchor_scales)  # Number of anchors at each location

# 3x3 sliding-window convolution followed by two sibling 1x1 heads.
conv1 = nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1)
# Regression head: 4 box offsets per anchor.
reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, kernel_size=1, stride=1, padding=0)
# Classification head: 2 scores per anchor (softmax); use 1 per anchor for sigmoid.
cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, kernel_size=1, stride=1, padding=0)

# initialization (as paper): weights ~ N(0, 0.01), biases zero
for layer in (conv1, reg_layer, cls_layer):
    layer.weight.data.normal_(0, 0.01)
    layer.bias.data.zero_()

# %%
# Run the RPN on the backbone feature map.
x = conv1(out_map)  # out_map is obtained in section 1
pred_anchor_locs = reg_layer(x)
pred_cls_scores = cls_layer(x)
print(pred_cls_scores.shape, pred_anchor_locs.shape)

# Move channels last, then flatten to one row of 4 offsets per anchor.
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous()
pred_anchor_locs = pred_anchor_locs.view(1, -1, 4)
print(pred_anchor_locs.shape)

pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores.shape)

# Second score of every (location, anchor) pair, flattened per image.
scores_per_anchor = pred_cls_scores.view(1, 50, 50, 9, 2)
objectness_score = scores_per_anchor[..., 1].contiguous().view(1, -1)
print(objectness_score.shape)

pred_cls_scores = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)

# %% RoI network

# Proposal settings: NMS IoU threshold, the number of boxes kept before and
# after NMS (training / testing), and the minimum box side length.
nms_thresh = 0.7
n_train_pre_nms, n_train_post_nms = 12000, 2000
n_test_pre_nms, n_test_post_nms = 6000, 300
min_size = 16

# %% inverse transform from y1, x1, y2, x2 to ctr_x, ctr_y, h, w
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

pred_anchor_locs_numpy = pred_anchor_locs[0].detach().numpy()
objectness_score_numpy = objectness_score[0].detach().numpy()

# Four (N, 1) column views: predicted centre shifts and log size ratios.
dy, dx, dh, dw = np.split(pred_anchor_locs_numpy, 4, axis=1)

# Apply the predicted offsets to the anchors.
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

# %% clip sides
# Back to [y1, x1, y2, x2] corners.
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=anchor_locs.dtype)
roi[:, 0:1] = ctr_y - 0.5 * h
roi[:, 1:2] = ctr_x - 0.5 * w
roi[:, 2:3] = ctr_y + 0.5 * h
roi[:, 3:4] = ctr_x + 0.5 * w

img_size = (800, 800)  # Image size
# Even columns are y coordinates, odd columns are x coordinates.
roi[:, 0::2] = np.clip(roi[:, 0::2], 0, img_size[0])
roi[:, 1::2] = np.clip(roi[:, 1::2], 0, img_size[1])
print(roi)
print(roi.shape)

# %% remove small ones
# Drop proposals whose height or width is below min_size.
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.flatnonzero((hs >= min_size) & (ws >= min_size))
roi = roi[keep, :]
scores = objectness_score_numpy[keep]
print(scores.shape)
print(roi.shape)

# %% sort and pick
# Indices of the proposals from highest to lowest score; keep the top
# n_train_pre_nms of them.
ordered_scores = np.argsort(scores.ravel())[::-1]
print(ordered_scores)
ordered_scores = ordered_scores[:n_train_pre_nms]
roi = roi[ordered_scores, :]
print(roi.shape)
print(roi)

# %%
# Non-maximum suppression: repeatedly keep the best-scoring remaining proposal
# and drop every other proposal that overlaps it by more than nms_thresh.

y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)

# Row k of roi has score scores[ordered_scores[k]]. Ordering must follow these
# scores (highest first), not the anchor indices stored in ordered_scores.
roi_scores = scores.ravel()[ordered_scores]
order = roi_scores.argsort()[::-1]
keep = []

while order.size > 0:
    i = order[0]
    keep.append(i)
    # Intersection of the kept box with every remaining box.
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)
    inds = np.where(ovr <= nms_thresh)[0]
    # inds is relative to order[1:], hence the +1.
    order = order[inds + 1]
keep = keep[:n_train_post_nms]  # while training/testing , use accordingly
# Explicit integer dtype so that an empty keep list still indexes cleanly.
roi = roi[np.asarray(keep, dtype=np.int64)]  # the final region proposals for training

# %%

# Sampling settings for the detection head.
n_samples = 128
pos_ratio = 0.25
pos_iou_thresh = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

# IoU between every proposal (rows) and every ground-truth box (columns).
# Computed with broadcasting so it works for any number of ground-truth boxes.
gt_boxes = bbox.cpu().numpy()

# Top-left and bottom-right corners of each (proposal, gt box) intersection.
inter_tl = np.maximum(roi[:, np.newaxis, :2], gt_boxes[np.newaxis, :, :2])
inter_br = np.minimum(roi[:, np.newaxis, 2:], gt_boxes[np.newaxis, :, 2:])
inter_hw = inter_br - inter_tl

# A pair overlaps only if the intersection is strictly positive on both axes;
# every other pair gets an intersection area (and therefore an IoU) of 0.
overlaps = (inter_hw > 0).all(axis=2)
inter_area = inter_hw.prod(axis=2) * overlaps

anchor_area = (roi[:, 2:] - roi[:, :2]).prod(axis=1)
box_area = (gt_boxes[:, 2:] - gt_boxes[:, :2]).prod(axis=1)

union_area = anchor_area[:, np.newaxis] + box_area[np.newaxis, :] - inter_area
ious = (inter_area / union_area).astype(np.float32)
print(ious.shape)

# %%

# For each proposal: the index of the best-overlapping ground-truth box and
# the IoU it has with that box.
gt_assignment = np.argmax(ious, axis=1)
max_ious = np.max(ious, axis=1)
print(gt_assignment)
print(max_ious)

# Class label of the assigned ground-truth box.
gt_roi_label = labels[gt_assignment]
print(gt_roi_label)

# %%
# Foreground RoIs: IoU >= pos_iou_thresh, capped at n_samples * pos_ratio.
pos_roi_per_image = int(n_samples * pos_ratio)
pos_index = np.where(max_ious >= pos_iou_thresh)[0]
pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
if pos_index.size > 0:
    pos_index = np.random.choice(
        pos_index, size=pos_roi_per_this_image, replace=False)
print(pos_roi_per_this_image)
print(pos_index)

# %%
# Background RoIs: IoU in [neg_iou_thresh_lo, neg_iou_thresh_hi).
neg_index = np.where((max_ious < neg_iou_thresh_hi) &
                     (max_ious >= neg_iou_thresh_lo))[0]
# The budget is the detection head's n_samples, not the RPN's n_sample.
neg_roi_per_this_image = n_samples - pos_roi_per_this_image
neg_roi_per_this_image = int(min(neg_roi_per_this_image,
                                 neg_index.size))
if neg_index.size > 0:
    neg_index = np.random.choice(
        neg_index, size=neg_roi_per_this_image, replace=False)
print(neg_roi_per_this_image)
print(neg_index)

# %%
# Positives first, negatives after, so a single slice can relabel the negatives.
keep_index = np.concatenate((pos_index, neg_index))
gt_roi_labels = gt_roi_label[keep_index]
gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
sample_roi = roi[keep_index]
print(sample_roi.shape)

# %%
# Regression targets (dy, dx, dh, dw) of each sampled RoI towards its assigned
# ground-truth box.
bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]
print(bbox_for_sampled_roi.shape)

height = sample_roi[:, 2] - sample_roi[:, 0]
width = sample_roi[:, 3] - sample_roi[:, 1]
ctr_y = sample_roi[:, 0] + 0.5 * height
ctr_x = sample_roi[:, 1] + 0.5 * width

# Convert the assigned boxes to numpy once, then do the same arithmetic there.
assigned_np = bbox_for_sampled_roi.cpu().numpy()
base_height = assigned_np[:, 2] - assigned_np[:, 0]
base_width = assigned_np[:, 3] - assigned_np[:, 1]
base_ctr_y = assigned_np[:, 0] + 0.5 * base_height
base_ctr_x = assigned_np[:, 1] + 0.5 * base_width

# Floor the RoI sizes at machine epsilon so the divisions below are safe.
eps = np.finfo(height.dtype).eps
height = np.clip(height, eps, None)
width = np.clip(width, eps, None)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)
gt_roi_locs = np.stack((dy, dx, dh, dw), axis=1)
print(gt_roi_locs)

# %% FAST RCNN
rois = torch.from_numpy(sample_roi).float()
# Image index of every RoI; all zeros here.
roi_indices = torch.from_numpy(np.zeros((len(rois),), dtype=np.int32)).float()
print(rois.shape, roi_indices.shape)

# %%
# Prepend the image index, then reorder the columns from (y1, x1, y2, x2)
# to (x1, y1, x2, y2).
indices_and_rois = torch.cat([roi_indices.unsqueeze(1), rois], dim=1)
column_order = [0, 2, 1, 4, 3]
xy_indices_and_rois = indices_and_rois[:, column_order]
indices_and_rois = xy_indices_and_rois.contiguous()
print(xy_indices_and_rois.shape)

# %%
# RoI pooling: crop each RoI from the feature map and max-pool it to size x size.
size = 7  # max pool 7x7
adaptive_max_pool = nn.AdaptiveMaxPool2d(size)
output = []
# Work on a copy: .float() on a float tensor returns the very same tensor, so
# scaling in place would silently rescale indices_and_rois as well.
rois = indices_and_rois.detach().clone().float()
rois[:, 1:].mul_(1.0 / sub_sample)  # Subsampling ratio skipping the index
rois = rois.long()
num_rois = rois.size(0)
for i in range(num_rois):
    # Columns are (image index, x1, y1, x2, y2) in feature-map cells.
    im_idx, left, top, right, bottom = rois[i].tolist()
    im = out_map.narrow(0, im_idx, 1)[..., top:(bottom + 1), left:(right + 1)]
    output.append(adaptive_max_pool(im))
output = torch.cat(output, 0)
print(output.size())
# Reshape the tensor so that we can pass it through the feed forward layer.
k = output.view(output.size(0), -1)
print(k.shape)

# %%

# Detection head: two fully connected layers followed by sibling box-regression
# and classification layers.
roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096),
                                      nn.Linear(4096, 4096)])
cls_loc = nn.Linear(4096, 21 * 4)  # (VOC 20 classes + 1 background. Each will have 4 co-ordinates)
# This initialisation was previously swallowed by the comment above and never ran.
cls_loc.weight.data.normal_(0, 0.01)
cls_loc.bias.data.zero_()
score = nn.Linear(4096, 21)  # (VOC 20 classes + 1 background)

k = roi_head_classifier(k)
roi_cls_loc = cls_loc(k)
roi_cls_score = score(k)
print(roi_cls_loc.shape, roi_cls_score.shape)

print(pred_anchor_locs.shape)
print(pred_cls_scores.shape)
print(anchor_locations.shape)
print(anchor_labels.shape)
# %%
# RPN predictions of the first (only) image and their targets as tensors.
rpn_loc = pred_anchor_locs.select(0, 0)
rpn_score = pred_cls_scores.select(0, 0)
gt_rpn_loc = torch.from_numpy(anchor_locations)
gt_rpn_score = torch.from_numpy(anchor_labels)
print(rpn_loc.shape, rpn_score.shape, gt_rpn_loc.shape, gt_rpn_score.shape)
# %%
import torch.nn.functional as F

# Classification loss over all anchors whose label is not -1.
rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_score.long(), ignore_index=-1)
print(rpn_cls_loss)

# Regression loss only uses the positive anchors.
pos = gt_rpn_score > 0
mask = pos.unsqueeze(1).expand_as(rpn_loc)
print(mask.shape)
# %%
mask_loc_preds = rpn_loc[mask].view(-1, 4)
mask_loc_targets = gt_rpn_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_targets.shape)
# %%
# Smooth L1: 0.5 * x^2 where |x| < 1, |x| - 0.5 elsewhere.
x = torch.abs(mask_loc_targets - mask_loc_preds)
rpn_loc_loss = ((x < 1).float() * 0.5 * x**2) + ((x >= 1).float() * (x-0.5))
print(rpn_loc_loss.sum())

# %% RPN LOSS
rpn_lambda = 10.
# Clamp so that a batch without positive anchors gives 0 instead of NaN.
N_reg = (gt_rpn_score > 0).float().sum().clamp(min=1.)
rpn_loc_loss = rpn_loc_loss.sum() / N_reg
rpn_loss = rpn_cls_loss + (rpn_lambda * rpn_loc_loss)
print(rpn_loss)


# %% FAST RCNN LOSS

print(roi_cls_loc.shape)
print(roi_cls_score.shape)
print(gt_roi_locs.shape)
print(gt_roi_labels.shape)

gt_roi_loc = torch.from_numpy(gt_roi_locs)
# as_tensor accepts both tensors and numpy arrays.
gt_roi_label = torch.as_tensor(gt_roi_labels).long()
print(gt_roi_loc.shape, gt_roi_label.shape)

# %% classification loss
roi_cls_loss = F.cross_entropy(roi_cls_score, gt_roi_label, ignore_index=-1)
print(roi_cls_loss)
# %% regression loss
n_sample = roi_cls_loc.shape[0]
roi_loc = roi_cls_loc.view(n_sample, -1, 4)
print(roi_loc.shape)
# Pick, for each RoI, the 4 offsets predicted for its ground-truth class.
roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]
print(roi_loc.shape)
# Smooth L1: 0.5 * x^2 where |x| < 1, |x| - 0.5 elsewhere.
x_roi = torch.abs(gt_roi_loc - roi_loc)
roi_loc_loss = ((x_roi < 1).float() * 0.5 * x_roi ** 2) + ((x_roi >= 1).float() * (x_roi - 0.5))
# Background RoIs (label 0) have no box to regress to, so they contribute no
# localisation loss.
pos_roi = gt_roi_label > 0
roi_loc_loss = roi_loc_loss * pos_roi.unsqueeze(1).float()
print(roi_loc_loss.sum())

# %% total loss
roi_lambda = 10.
# Normalise by the number of foreground RoIs (not by the RPN's positive
# anchors); clamp so that a batch without foreground RoIs gives 0, not NaN.
N_reg_roi = pos_roi.float().sum().clamp(min=1.)
roi_loc_loss = roi_loc_loss.sum() / N_reg_roi
roi_loss = roi_cls_loss + (roi_lambda * roi_loc_loss)
print(roi_loss)

total_loss = rpn_loss + roi_loss
print(total_loss)

--

--