Learning Day 30: IMDB comment classification with LSTM in Pytorch

De Jun Huang

Published in

dejunhuang

2 min read · May 15, 2021

IMDB dataset

TEXT: the actual comments, e.g. "I like this movie in the aspects of …"
LABEL: positive or negative

LSTM implementation:

The model part is easy to understand except for bidirectional=True, dropout=0.5
The dataset set-up part is confusing

import torch
from torch import nn, optim
from torchtext.legacy import data
from torchtext.legacy import datasets
import numpy as np

# Seed PyTorch's RNG for reproducibility.
torch.manual_seed(123)

# TEXT tokenises each review with the spaCy tokenizer; LABEL stores the
# target as a float so it can be passed directly to BCEWithLogitsLoss.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print(f"len of train: {len(train_data)}")
print(f"len of test: {len(test_data)}")

# Print one example to sanity-check the tokenised text and its label.
print(train_data.examples[15].text)
print(train_data.examples[15].label)

# Cap the vocabulary at 10000 entries (max_size) and attach the 100-d GloVe
# vectors; these are copied into the model's embedding layer later on.
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)

batch = 30
# Use the GPU when one is available, otherwise fall back to the CPU so the
# script still runs on machines without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch,
    device=device
)


class LSTM(nn.Module):
    """Bidirectional, stacked LSTM that maps a token sequence to one logit.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers (default 2).
        dropout: dropout probability used on the embeddings, between stacked
            LSTM layers, and on the final hidden state (default 0.5).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers=2, dropout=0.5):
        super().__init__()

        # token id => [embedding_dim]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # [embedding_dim] => [hidden_dim] per direction.
        # nn.LSTM only applies dropout *between* stacked layers and warns if a
        # non-zero value is given for a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True,
                            dropout=dropout if num_layers > 1 else 0.0)

        # Forward and backward final states are concatenated => hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one raw (pre-sigmoid) score per sequence.

        Args:
            x: integer token ids of shape [seq, b]; the LSTM is not built
                with ``batch_first``, so the sequence axis comes first.

        Returns:
            Tensor of shape [b, 1] holding unnormalised logits.
        """
        # [seq, b] => [seq, b, embedding_dim]
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]  (unused)
        # h, c:   [num_layers*2, b, hid_dim]
        _, (h, _) = self.lstm(embedding)

        # h[-2] / h[-1] are the last layer's forward / backward final states:
        # 2 of [b, hid_dim] => [b, hid_dim*2]
        h = torch.cat([h[-2], h[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        h = self.dropout(h)
        return self.fc(h)


# Classifier sized to the vocabulary: 100-d embeddings, 256 hidden units.
model = LSTM(vocab_size=len(TEXT.vocab), embedding_dim=100, hidden_dim=256)

# Initialise the embedding table from the vectors attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
print(f"pretrained_embedding: {pretrained_embedding.shape}")
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embedding)

# Move the model and the loss to the target device, then build the optimiser
# over the model's parameters.
model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


def binary_acc(preds, y):
    """Return the fraction of predictions that match the targets.

    ``preds`` are raw logits: they are passed through a sigmoid and rounded
    to 0/1 before being compared elementwise with ``y``. The result is a
    tensor equal to (number of matches) / len(preds).
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)


def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and print accuracy.

    For every batch the model output is scored with ``criterion`` and one
    ``optimizer`` step is taken. The batch accuracy is printed on every 10th
    batch, and the unweighted mean of the per-batch accuracies is printed at
    the end. Nothing is returned.
    """
    model.train()
    batch_accs = []

    for step, minibatch in enumerate(iterator):
        targets = minibatch.label

        # model output [b, 1] => [b] so it lines up with the labels
        logits = model(minibatch.text).squeeze(1)

        loss = criterion(logits, targets)
        step_acc = binary_acc(logits, targets).item()
        batch_accs.append(step_acc)

        # clear old gradients, backpropagate, update the weights
        model.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 10 == 0:
            print(step, step_acc)

    avg_acc = np.mean(batch_accs)
    print(f"avg acc: {avg_acc}")


def eval(model, iterator, criterion):
    """Print the mean per-batch accuracy of ``model`` over ``iterator``.

    The model is switched to eval mode and run without gradient tracking.
    ``criterion`` is accepted for signature symmetry with ``train`` but is
    not used. Nothing is returned.

    NOTE: this function shadows the builtin ``eval`` within this module.
    """
    model.eval()
    batch_accs = []

    with torch.no_grad():
        for minibatch in iterator:
            # model output [b, 1] => [b]
            logits = model(minibatch.text).squeeze(1)
            batch_accs.append(binary_acc(logits, minibatch.label).item())

    avg_acc = np.mean(batch_accs)
    print(f"test: {avg_acc}")


for epoch in range(10):
    # Evaluate before training so the first report is the untrained baseline
    # and each later report reflects the previous epoch's weights.
    eval(model, test_iterator, criterion)
    train(model, train_iterator, optimizer, criterion)

# The loop scores the model *before* each training pass, so without this
# extra call the weights produced by the last epoch would never be evaluated.
eval(model, test_iterator, criterion)

Questions

Regarding dataset set-up:

What is the logic behind data.Field and data.LabelField?
What is the use of the tokenizer here: data.Field(tokenize='spacy')?
If the dataset is already transformed using GloVe, why does the LSTM still need another embedding operation?

Regarding model:

Why is the output shape [seq, b, hid_dim*2] with bidirectional=True?
Why is the shape of h and c [num_layers*2, b, hid_dim] with bidirectional=True?
Why do we need to concatenate h[-2] and h[-1] to form h, and what do h[-2] and h[-1] represent?
What is the meaning of dropout for embedding and h? embedding = self.dropout(self.embedding(x)), h = self.dropout(h)
Why is the LSTM model here more computationally expensive than CNN models? (Possible answer: the images used previously were 28x28 or 32x32, whereas here embedding_dim = 100 and hid_dim = 256.)

Reference

link1

Learning Day 30: IMDB comment classification with LSTM in Pytorch

IMDB dataset

LSTM implementation:

Questions

Regarding dataset set-up:

Regarding model:

Reference

Written by De Jun Huang