(ML) TIMIT framewise phoneme classification -ML2021Spring

YEN HUNG CHENG
5 min read · Oct 21, 2023

--

Photo by Pawel Czerwinski on Unsplash

Baseline

simple -> 0.68561

strong -> 0.76118

Hints

Simple Baseline

✅Run sample code

Strong Baseline

✅Model architecture (layers? dimension? activation function?)

✅Training (batch size? optimizer? learning rate? epoch?)

✅Tips (batch norm? dropout? regularization?)

Download Data

notes: if the google drive link is dead, you can download the data directly from Kaggle and upload it to the workspace.

!gdown --id '1HPkcmQmFGu-3OknddKIa5dNDsR05lIQR' --output data.zip
!unzip data.zip
!ls

Preparing Data

Load the training and testing data from the .npy file (NumPy array).

import numpy as np

print('Loading data ...')

# Root directory produced by unzipping data.zip in the download step above.
data_root='./timit_11/'
# Pre-extracted frame features: one row per frame. The model below takes
# 429-dim inputs — presumably 39 MFCC features over an 11-frame context
# window (39 * 11 = 429); TODO confirm against the assignment handout.
train = np.load(data_root + 'train_11.npy')
train_label = np.load(data_root + 'train_label_11.npy')
test = np.load(data_root + 'test_11.npy')

print('Size of training data: {}'.format(train.shape))
print('Size of testing data: {}'.format(test.shape))

Create Dataset

import torch
from torch.utils.data import Dataset

class TIMITDataset(Dataset):
    """Dataset of pre-extracted TIMIT frame features.

    Args:
        X: numpy array of per-frame features, one row per sample.
        y: optional numpy array of integer phoneme labels; pass None for
           unlabeled (test) data.
    """

    def __init__(self, X, y=None):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin int is the documented replacement and behaves the same.
            y = y.astype(int)
            self.label = torch.LongTensor(y)
        else:
            # No labels: __getitem__ returns features only.
            self.label = None

    def __getitem__(self, idx):
        # Labeled samples are (features, label) pairs; test samples are bare features.
        if self.label is not None:
            return self.data[idx], self.label[idx]
        return self.data[idx]

    def __len__(self):
        return len(self.data)

Split the labeled data into a training set and a validation set, you can modify the variable VAL_RATIO to change the ratio of validation data.

# Fraction of the labeled data held out for validation.
VAL_RATIO = 0.2

# Sequential split: first (1 - VAL_RATIO) of the rows train, the rest validate.
# NOTE(review): frames are not shuffled before splitting, so the validation
# set is the tail of the original ordering — confirm this is intended.
percent = int(train.shape[0] * (1 - VAL_RATIO))
train_x, train_y, val_x, val_y = train[:percent], train_label[:percent], train[percent:], train_label[percent:]
print('Size of training set: {}'.format(train_x.shape))
print('Size of validation set: {}'.format(val_x.shape))

Create a data loader from the dataset, feel free to tweak the variable BATCH_SIZE here.

# Mini-batch size for both training and validation loaders.
BATCH_SIZE = 64

from torch.utils.data import DataLoader

train_set = TIMITDataset(train_x, train_y)
val_set = TIMITDataset(val_x, val_y)
# Shuffle only the training split; validation order does not affect metrics.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True) #only shuffle the training data
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

Cleanup the unneeded variables to save memory.

notes: if you need to use these variables later, then you may remove this block or clean up unneeded variables later
the data size is quite huge, so be aware of memory usage in colab

import gc

# The raw arrays were already copied into the TIMITDataset tensors above,
# so the originals can be released to lower peak memory (relevant on Colab).
del train, train_label, train_x, train_y, val_x, val_y
gc.collect()

Create Model

Define model architecture, you are encouraged to change and experiment with the model architecture.

import torch
import torch.nn as nn

class Classifier(nn.Module):
    """Feed-forward phoneme classifier: 429-dim frame features -> 39 class logits."""

    def __init__(self):
        super(Classifier, self).__init__()
        # Attribute names (layer1..layer3, out) are kept so state_dict keys
        # match checkpoints saved by earlier runs.
        self.layer1 = nn.Linear(429, 1024)
        self.layer2 = nn.Linear(1024, 512)
        self.layer3 = nn.Linear(512, 128)
        self.out = nn.Linear(128, 39)

        self.act_fn = nn.Sigmoid()

    def forward(self, x):
        # Each hidden layer is followed by a sigmoid activation; the output
        # layer stays linear so CrossEntropyLoss receives raw logits.
        for hidden in (self.layer1, self.layer2, self.layer3):
            x = self.act_fn(hidden(x))
        return self.out(x)

Training

#check device
def get_device():
    """Return 'cuda' when a GPU is available, otherwise 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

Fix random seeds for reproducibility.

# fix random seed
def same_seeds(seed):
    """Seed NumPy and PyTorch RNGs and pin cuDNN for reproducible runs.

    Note: deterministic cuDNN kernels can be slower than the benchmarked ones.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed the current GPU and every other visible device.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Feel free to change the training parameters here.

# fix random seed for reproducibility
same_seeds(0)

# get device
device = get_device()
print(f'DEVICE: {device}')

# training parameters
num_epoch = 20 # number of training epoch
learning_rate = 0.0001 # learning rate

# the path where checkpoint saved
model_path = './model.ckpt'

# create model, define a loss function, and optimizer
# CrossEntropyLoss expects raw logits, matching Classifier's linear output layer.
model = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# start training
# start training

# Best validation accuracy (correct-count) seen so far; gates checkpointing.
best_acc = 0.0
for epoch in range(num_epoch):
    # Per-epoch accumulators: correct-prediction counts and summed batch losses.
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # training
    model.train() # set the model to training mode
    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        _, train_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
        batch_loss.backward()
        optimizer.step()

        # Count correct predictions on this batch and accumulate the loss.
        train_acc += (train_pred.cpu() == labels.cpu()).sum().item()
        train_loss += batch_loss.item()

    # validation
    if len(val_set) > 0:
        model.eval() # set the model to evaluation mode
        with torch.no_grad():
            for i, data in enumerate(val_loader):
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                _, val_pred = torch.max(outputs, 1) # get the index of the class with the highest probability

                val_acc += (val_pred.cpu() == labels.cpu()).sum().item()
                val_loss += batch_loss.item()

            # Accuracies are normalized by dataset size, losses by batch count.
            print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
                epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader)
            ))

            # if the model improves, save a checkpoint at this epoch
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(), model_path)
                print('saving model with acc {:.3f}'.format(best_acc/len(val_set)))
    else:
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader)
        ))

# if not validating, save the last epoch
if len(val_set) == 0:
    torch.save(model.state_dict(), model_path)
    print('saving model at last epoch')

Testing

Create a testing dataset, and load model from the saved checkpoint.

# create testing dataset
# y=None makes TIMITDataset yield bare feature tensors (no labels).
test_set = TIMITDataset(test, None)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

# create model and load weights from the best checkpoint saved during training
model = Classifier().to(device)
model.load_state_dict(torch.load(model_path))

Make prediction.

# Run inference over the test loader, collecting the argmax over the 39
# phoneme logits for every frame.
predict = []
model.eval() # evaluation mode: no gradient-dependent layers active
with torch.no_grad():
    for batch in test_loader:
        feats = batch.to(device)
        logits = model(feats)
        _, batch_pred = torch.max(logits, 1) # index of the highest-scoring class
        predict.extend(batch_pred.cpu().numpy())

Write prediction to a CSV file.

After this block finishes running, download the file prediction.csv from the Files section on the left-hand side and submit it to Kaggle.

# Dump predictions in Kaggle's expected "Id,Class" format, one row per frame.
with open('prediction.csv', 'w') as out_f:
    rows = ['Id,Class\n']
    rows.extend('{},{}\n'.format(row_id, phoneme) for row_id, phoneme in enumerate(predict))
    out_f.writelines(rows)

Kaggle

Simple Baseline

Without modifying any code, I submitted my predictions and got 0.68868, which is above the simple baseline of 0.68561.

Strong Baseline

DNN architecture

class Classifier(nn.Module):
    """Deeper classifier for the strong baseline.

    Seven Linear -> BatchNorm -> LeakyReLU -> Dropout(0.3) stages followed by
    a linear output layer producing 39 phoneme logits.
    """

    def __init__(self):
        super(Classifier, self).__init__()
        # Attribute names and creation order are kept identical to the original
        # so state_dict keys and seeded parameter initialization are unchanged.
        self.layer1 = nn.Linear(429, 512)
        self.bn1 = nn.BatchNorm1d(512)

        self.layer2 = nn.Linear(512, 512)
        self.bn2 = nn.BatchNorm1d(512)

        self.layer3 = nn.Linear(512, 512)
        self.bn3 = nn.BatchNorm1d(512)

        self.layer4 = nn.Linear(512, 512)
        self.bn4 = nn.BatchNorm1d(512)

        self.layer5 = nn.Linear(512, 512)
        self.bn5 = nn.BatchNorm1d(512)

        self.layer6 = nn.Linear(512, 512)
        self.bn6 = nn.BatchNorm1d(512)

        self.layer7 = nn.Linear(512, 128)
        self.bn7 = nn.BatchNorm1d(128)

        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(128, 39)

        self.act_fn = nn.LeakyReLU()

    def forward(self, x):
        # The same stage repeats seven times; iterate instead of unrolling.
        stages = (
            (self.layer1, self.bn1),
            (self.layer2, self.bn2),
            (self.layer3, self.bn3),
            (self.layer4, self.bn4),
            (self.layer5, self.bn5),
            (self.layer6, self.bn6),
            (self.layer7, self.bn7),
        )
        for linear, bn in stages:
            x = self.drop(self.act_fn(bn(linear(x))))
        # Output layer stays linear: CrossEntropyLoss expects raw logits.
        return self.out(x)

In the subsequent experiments, I changed the activation function to LeakyReLU, deepened the neural network and added a batch normalization layer after each layer, followed by a dropout layer. After trying repeatedly, I got a private score of 0.7362. I felt disappointed that I did not pass the strong baseline.

Reference

E.g.
Source: (https://github.com/ga642381/ML2021-Spring/blob/main/HW02/HW02-1.ipynb)

--

--