TDS Archive

An archive of data science, data analytics, data engineering, machine learning, and artificial intelligence writing from the former Towards Data Science Medium publication.

Bird by Bird Tech

Bird by Bird using Deep Learning

17 min read · Jan 3, 2021


Image by author

Introducing the related work

Applications of deep learning for computer vision

Design principles of convolutional neural networks

Figure 1. LeCun’s LeNet-5, a convolutional neural network model composed of convolution and sub-sampling operations followed by fully-connected layers that process the features extracted by the previous layers to form the final output (adapted from [2])
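
To make these design principles concrete, here is a minimal LeNet-5-style network in PyTorch. This is a sketch for illustration only (the 32x32 grayscale input and layer sizes follow the classic setup from [2]), not a model used later in this article:

import torch

class LeNet5(torch.nn.Module):
    """
    Minimal LeNet-5-style CNN: convolution and sub-sampling operations
    followed by fully-connected layers.
    """
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = torch.nn.Sequential(
            torch.nn.Conv2d(1, 6, kernel_size=5),    # 1x32x32 -> 6x28x28
            torch.nn.Tanh(),
            torch.nn.AvgPool2d(2),                   # sub-sampling -> 6x14x14
            torch.nn.Conv2d(6, 16, kernel_size=5),   # -> 16x10x10
            torch.nn.Tanh(),
            torch.nn.AvgPool2d(2),                   # sub-sampling -> 16x5x5
        )
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(16 * 5 * 5, 120),
            torch.nn.Tanh(),
            torch.nn.Linear(120, 84),
            torch.nn.Tanh(),
            torch.nn.Linear(84, num_classes),
        )

    def forward(self, x):
        x = self.features(x)       # extract features
        x = torch.flatten(x, 1)    # flatten for the fully-connected part
        return self.classifier(x)  # form the final output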

Towards residual networks

Figure 2. Skip connection block of ResNets
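
The skip connection sketched in Figure 2 lets a block learn a residual mapping F(x) and add the input x back before the final non-linearity, so gradients can always flow through the identity path. A minimal sketch of a basic, dimension-preserving residual block (for illustration; torchvision's implementation differs in details):

import torch
import torch.nn.functional as F

class ResidualBlock(torch.nn.Module):
    """
    Basic residual block: output = ReLU(F(x) + x).
    """
    def __init__(self, channels):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = torch.nn.BatchNorm2d(channels)
        self.conv2 = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = torch.nn.BatchNorm2d(channels)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return F.relu(out + x)  # the skip connection adds the input back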

Classification of bird species using ResNet

# import packages
import os
import csv
import numpy as np
import sklearn.model_selection as skms
import sklearn.metrics as skmt
import torch
import torch.utils.data as td
import torch.nn.functional as F
import torchvision as tv
import torchvision.transforms.functional as TF

# define constants
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
RANDOM_SEED = 42
IN_DIR_DATA = 'data/CUB_200_2011'
IN_DIR_IMG = os.path.join(IN_DIR_DATA, 'images')

Discovering the optimal data transformation strategy

Figure 3. Hard-to-distinguish birds of the same family: White-throated Sparrow vs. Lincoln Sparrow (CUB-200-2011), vs. Captain Jack Sparrow (public domain)

# calculate the number of sparrow species
cls_sparrows = [k for k in os.listdir(IN_DIR_IMG) if 'sparrow' in k.lower()]
print(len(cls_sparrows))

Figure 4. Distribution of bird image sizes. Boxes reflect the interquartile range of the image widths and heights, with the median (the line within each box) marking the midpoint of the data; whiskers extend to observations below Q1 and above Q3, respectively, and the outside points denote data lying outside the overall distribution
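
Since the box plot in Figure 4 summarizes raw image dimensions, one way to gather those statistics is to read just the image headers. A minimal sketch, assuming the imports and constants defined above are in scope (this loop is illustrative, not part of the original code):

from PIL import Image

# collect widths and heights of all images (cheap: PIL reads only the header)
widths, heights = list(), list()
for cls_dir in os.listdir(IN_DIR_IMG):
    for fn in os.listdir(os.path.join(IN_DIR_IMG, cls_dir)):
        with Image.open(os.path.join(IN_DIR_IMG, cls_dir, fn)) as img:
            widths.append(img.width)
            heights.append(img.height)

print(np.median(widths), np.median(heights))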

Transforming images and splitting the data

class DatasetBirds(tv.datasets.ImageFolder):
    """
    Wrapper for the CUB-200-2011 dataset.
    Method DatasetBirds.__getitem__() returns a tuple of an image and its corresponding label.
    """
    def __init__(self,
                 root,
                 transform=None,
                 target_transform=None,
                 loader=tv.datasets.folder.default_loader,
                 is_valid_file=None,
                 train=True,
                 bboxes=False):

        img_root = os.path.join(root, 'images')

        super(DatasetBirds, self).__init__(
            root=img_root,
            transform=None,
            target_transform=None,
            loader=loader,
            is_valid_file=is_valid_file,
        )

        self.transform_ = transform
        self.target_transform_ = target_transform
        self.train = train

        # obtain sample ids filtered by split
        path_to_splits = os.path.join(root, 'train_test_split.txt')
        indices_to_use = list()
        with open(path_to_splits, 'r') as in_file:
            for line in in_file:
                idx, use_train = line.strip('\n').split(' ', 2)
                if bool(int(use_train)) == self.train:
                    indices_to_use.append(int(idx))

        # obtain filenames of images
        path_to_index = os.path.join(root, 'images.txt')
        filenames_to_use = set()
        with open(path_to_index, 'r') as in_file:
            for line in in_file:
                idx, fn = line.strip('\n').split(' ', 2)
                if int(idx) in indices_to_use:
                    filenames_to_use.add(fn)

        img_paths_cut = {'/'.join(img_path.rsplit('/', 2)[-2:]): idx for idx, (img_path, lb) in enumerate(self.imgs)}
        imgs_to_use = [self.imgs[img_paths_cut[fn]] for fn in filenames_to_use]

        _, targets_to_use = list(zip(*imgs_to_use))

        self.imgs = self.samples = imgs_to_use
        self.targets = targets_to_use

        if bboxes:
            # get coordinates of the bounding boxes
            path_to_bboxes = os.path.join(root, 'bounding_boxes.txt')
            bounding_boxes = list()
            with open(path_to_bboxes, 'r') as in_file:
                for line in in_file:
                    idx, x, y, w, h = map(lambda x: float(x), line.strip('\n').split(' '))
                    if int(idx) in indices_to_use:
                        bounding_boxes.append((x, y, w, h))

            self.bboxes = bounding_boxes
        else:
            self.bboxes = None

    def __getitem__(self, index):
        # generate one sample
        sample, target = super(DatasetBirds, self).__getitem__(index)

        if self.bboxes is not None:
            # squeeze coordinates of the bounding box into the range [0, 1]
            width, height = sample.width, sample.height
            x, y, w, h = self.bboxes[index]

            scale_resize = 500 / width
            scale_resize_crop = scale_resize * (375 / 500)

            x_rel = scale_resize_crop * x / 375
            y_rel = scale_resize_crop * y / 375
            w_rel = scale_resize_crop * w / 375
            h_rel = scale_resize_crop * h / 375

            target = torch.tensor([target, x_rel, y_rel, w_rel, h_rel])

        if self.transform_ is not None:
            sample = self.transform_(sample)
        if self.target_transform_ is not None:
            target = self.target_transform_(target)

        return sample, target
def pad(img, size_max=500):
    """
    Pads images to the specified size (height x width).
    """
    pad_height = max(0, size_max - img.height)
    pad_width = max(0, size_max - img.width)

    pad_top = pad_height // 2
    pad_bottom = pad_height - pad_top
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left

    return TF.pad(
        img,
        (pad_left, pad_top, pad_right, pad_bottom),
        fill=tuple(map(lambda x: int(round(x * 256)), (0.485, 0.456, 0.406))))
# transform images
transforms_train = tv.transforms.Compose([
    tv.transforms.Lambda(pad),
    tv.transforms.RandomOrder([
        tv.transforms.RandomCrop((375, 375)),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.RandomVerticalFlip()
    ]),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

transforms_eval = tv.transforms.Compose([
    tv.transforms.Lambda(pad),
    tv.transforms.CenterCrop((375, 375)),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# instantiate dataset objects according to the pre-defined splits
ds_train = DatasetBirds(IN_DIR_DATA, transform=transforms_train, train=True)
ds_val = DatasetBirds(IN_DIR_DATA, transform=transforms_eval, train=True)
ds_test = DatasetBirds(IN_DIR_DATA, transform=transforms_eval, train=False)

splits = skms.StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_SEED)
idx_train, idx_val = next(splits.split(np.zeros(len(ds_train)), ds_train.targets))

# set hyper-parameters
params = {'batch_size': 24, 'num_workers': 8}
num_epochs = 100
num_classes = 200

# instantiate data loaders
train_loader = td.DataLoader(
    dataset=ds_train,
    sampler=td.SubsetRandomSampler(idx_train),
    **params
)
val_loader = td.DataLoader(
    dataset=ds_val,
    sampler=td.SubsetRandomSampler(idx_val),
    **params
)
test_loader = td.DataLoader(dataset=ds_test, **params)

Building a baseline ResNet‑50 classifier

# instantiate the model
model = tv.models.resnet50(num_classes=num_classes).to(DEVICE)

Training and evaluating the model

# instantiate optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# loop over epochs
for epoch in range(num_epochs):
    # train the model
    model.train()
    train_loss = list()
    train_acc = list()
    for batch in train_loader:
        x, y = batch

        x = x.to(DEVICE)
        y = y.to(DEVICE)

        optimizer.zero_grad()

        # predict bird species
        y_pred = model(x)

        # calculate the loss
        loss = F.cross_entropy(y_pred, y)

        # backprop & update weights
        loss.backward()
        optimizer.step()

        # calculate the accuracy
        acc = skmt.accuracy_score([val.item() for val in y], [val.item() for val in y_pred.argmax(dim=-1)])

        train_loss.append(loss.item())
        train_acc.append(acc)

    # validate the model
    model.eval()
    val_loss = list()
    val_acc = list()
    with torch.no_grad():
        for batch in val_loader:
            x, y = batch

            x = x.to(DEVICE)
            y = y.to(DEVICE)

            # predict bird species
            y_pred = model(x)

            # calculate the loss
            loss = F.cross_entropy(y_pred, y)

            # calculate the accuracy
            acc = skmt.accuracy_score([val.item() for val in y], [val.item() for val in y_pred.argmax(dim=-1)])

            val_loss.append(loss.item())
            val_acc.append(acc)

    # adjust the learning rate
    scheduler.step()

# test the model
model.eval()
true = list()
pred = list()
with torch.no_grad():
    for batch in test_loader:
        x, y = batch

        x = x.to(DEVICE)
        y = y.to(DEVICE)

        y_pred = model(x)

        true.extend([val.item() for val in y])
        pred.extend([val.item() for val in y_pred.argmax(dim=-1)])

# calculate the accuracy
test_accuracy = skmt.accuracy_score(true, pred)
print('Test accuracy: {:.3f}'.format(test_accuracy))

Figure 5. Cross-entropy loss and accuracy against the number of epochs for the baseline ResNet-50
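
Curves like those in Figure 5 can be drawn by averaging the per-batch metrics into per-epoch values. A minimal sketch with matplotlib, assuming a hypothetical history dictionary (my name, not part of the original code) appended to at the end of every epoch, e.g. history['train_loss'].append(np.mean(train_loss)):

import matplotlib.pyplot as plt

# hypothetical per-epoch histories, populated inside the training loop
history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(history['train_loss'], label='train')
ax_loss.plot(history['val_loss'], label='validation')
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('cross-entropy loss')
ax_loss.legend()
ax_acc.plot(history['train_acc'], label='train')
ax_acc.plot(history['val_acc'], label='validation')
ax_acc.set_xlabel('epoch')
ax_acc.set_ylabel('accuracy')
ax_acc.legend()
fig.tight_layout()
plt.show()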

Advancing the deep learning model

How to deal with overfitting given the limited number of training samples?

# instantiate the model with pre-trained ImageNet weights and replace its final
# fully-connected layer to match the number of bird classes (loading the
# 1000-class weights directly into a 200-class head would fail)
model = tv.models.resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
model = model.to(DEVICE)

Figure 6. Cross-entropy loss and accuracy against the number of epochs for the pre-trained ResNet-50

How to improve model performance in bird species recognition?

Solution 1: Multi-task learning

# instantiate the pre-trained model with 204 outputs:
# 200 class logits plus 4 bounding-box coordinates
model = tv.models.resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, 204)
model = model.to(DEVICE)

...

y_pred = model(x)

# predict bird species
y_pred_cls = y_pred[..., :-4]
y_cls = y[..., 0].long()

# predict bounding box coordinates
y_pred_bbox = y_pred[..., -4:]
y_bbox = y[..., 1:]

# calculate the loss
loss_cls = F.cross_entropy(y_pred_cls, y_cls)
loss_bbox = F.mse_loss(torch.sigmoid(y_pred_bbox), y_bbox)
loss = loss_cls + loss_bbox

...

Figure 7. Cross-entropy loss and accuracy against the number of epochs for the pre-trained ResNet-50 enhanced with the auxiliary task

Solution 2: Attention-enhanced CNNs

class Attention(torch.nn.Module):
    """
    Attention block for CNN model.
    """
    def __init__(self, in_channels, out_channels, kernel_size, padding):
        super(Attention, self).__init__()
        self.conv_depth = torch.nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, groups=in_channels)
        self.conv_point = torch.nn.Conv2d(out_channels, out_channels, kernel_size=(1, 1))
        self.bn = torch.nn.BatchNorm2d(out_channels, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True)
        self.activation = torch.nn.Tanh()

    def forward(self, inputs):
        x, output_size = inputs
        x = F.adaptive_max_pool2d(x, output_size=output_size)
        x = self.conv_depth(x)
        x = self.conv_point(x)
        x = self.bn(x)
        x = self.activation(x) + 1.0
        return x


class ResNet50Attention(torch.nn.Module):
    """
    Attention-enhanced ResNet-50 model.
    """
    weights_loader = staticmethod(tv.models.resnet50)

    def __init__(self, num_classes=200, pretrained=True, use_attention=True):
        super(ResNet50Attention, self).__init__()
        net = self.weights_loader(pretrained=pretrained)

        self.num_classes = num_classes
        self.pretrained = pretrained
        self.use_attention = use_attention

        net.fc = torch.nn.Linear(
            in_features=net.fc.in_features,
            out_features=num_classes,
            bias=net.fc.bias is not None
        )

        self.net = net

        if self.use_attention:
            # in/out channels follow the ResNet-50 stage outputs
            # (64 after the stem, then 256/512/1024/2048 due to bottleneck expansion)
            self.att1 = Attention(in_channels=64, out_channels=256, kernel_size=(3, 5), padding=(1, 2))
            self.att2 = Attention(in_channels=256, out_channels=512, kernel_size=(5, 3), padding=(2, 1))
            self.att3 = Attention(in_channels=512, out_channels=1024, kernel_size=(3, 5), padding=(1, 2))
            self.att4 = Attention(in_channels=1024, out_channels=2048, kernel_size=(5, 3), padding=(2, 1))

            if pretrained:
                # zero-initialize the attention batch norms, so that each block
                # initially acts as the identity (tanh(0) + 1 = 1)
                self.att1.bn.weight.data.zero_()
                self.att1.bn.bias.data.zero_()
                self.att2.bn.weight.data.zero_()
                self.att2.bn.bias.data.zero_()
                self.att3.bn.weight.data.zero_()
                self.att3.bn.bias.data.zero_()
                self.att4.bn.weight.data.zero_()
                self.att4.bn.bias.data.zero_()

    def _forward(self, x):
        return self.net(x)

    def _forward_att(self, x):
        x = self.net.conv1(x)
        x = self.net.bn1(x)
        x = self.net.relu(x)
        x = self.net.maxpool(x)

        x_a = x.clone()
        x = self.net.layer1(x)
        x = x * self.att1((x_a, x.shape[-2:]))

        x_a = x.clone()
        x = self.net.layer2(x)
        x = x * self.att2((x_a, x.shape[-2:]))

        x_a = x.clone()
        x = self.net.layer3(x)
        x = x * self.att3((x_a, x.shape[-2:]))

        x_a = x.clone()
        x = self.net.layer4(x)
        x = x * self.att4((x_a, x.shape[-2:]))

        x = self.net.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.net.fc(x)

        return x

    def forward(self, x):
        return self._forward_att(x) if self.use_attention else self._forward(x)
# instantiate the model
model = ResNet50Attention(num_classes=204, pretrained=True, use_attention=True).to(DEVICE)

Figure 8. Performance comparison of the ResNet-50 models advanced using different techniques

Conclusions

More coming soon!

References
