Profiling a Training Task with PyTorch Profiler and Viewing It in TensorBoard

This post shows, through a short example, how to profile a model training task with the PyTorch profiler. Developers use profiling tools to understand the behavior of their code so they can optimize it. The TensorFlow framework provides a good ecosystem for machine learning developers and optimizers to profile their tasks, and the following posts show how to use TensorFlow and TensorBoard for that purpose.

PyTorch also provides developers who choose this framework with a profiling option whose results can be viewed as text or in TensorBoard. The first step in using the PyTorch profiler and viewing the results in TensorBoard is installing the following packages:

$ pip install tensorboard
$ pip install torch_tb_profiler
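
Before the full example below, here is a minimal sketch of using the profiler as a context manager and printing its results as a plain-text table; the linear model and random input are placeholders used only for illustration:

import torch
from torch.profiler import profile, ProfilerActivity

# placeholder model and input, just to have something to profile
model = torch.nn.Linear(128, 10)
x = torch.randn(32, 128)

# record CPU activity while running a single forward pass
with profile(activities=[ProfilerActivity.CPU]) as prof:
    model(x)

# print the aggregated per-operator statistics as plain text
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

The same profile object can instead be given an on_trace_ready handler to export the results for TensorBoard, which is what the full example below does.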

The following example is an MNIST classifier built with a simple convolutional neural network, whose implementation is adapted from here. Pay attention to the part that starts and ends with the “Profiling starts here” and “Profiling ends here” comments.


from __future__ import print_function
import argparse
import time  # used to measure total execution time below
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    # Defining the profiler
    prof = torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        #schedule=torch.profiler.schedule(wait=1, warmup=1, active=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs/mnist'),
        #record_shapes=True,
        #profile_memory=True,
        #with_stack=True,
        #with_flops=True,
        #with_modules=True
    )
    # Profiling starts here
    prof.start()
    for epoch in range(1, args.epochs + 1):
        print("epoch = ", epoch, device)
        train(args, model, device, train_loader, optimizer, epoch)
        prof.step()  # mark the end of one profiling step (here, one epoch)
    prof.stop()
    # Profiling ends here

    test(model, device, test_loader)
    scheduler.step()

    print(prof.key_averages())  # text summary of the collected statistics

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    t1 = time.perf_counter()
    main()
    t2 = time.perf_counter()
    execution_time = t2 - t1
    print("time: ", execution_time)

Notes:

  1. Try to narrow the profiling scope to the code you are actually interested in, such as the train() function in the example above.
  2. Tell the profiler which steps to record by passing it a schedule and calling prof.step() at each step boundary (see the sketch after this list).
  3. When using a profiler schedule, make sure training runs for enough steps (epochs, in this example) to cover the wait, warmup, and active phases; otherwise nothing is recorded.
  4. If you profile all epochs, the trace file may become so large that TensorBoard has trouble loading it; even for this tiny example model it can grow to several gigabytes.
  5. Never profile all epochs; the right practice is to profile a single iteration. Even profiling one epoch means going over the whole dataset and gathering excessively repetitive information. In this example, it is better to go inside the training function and profile one iteration or a few batches, as sketched below.
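
As a rough sketch of notes 2 and 5 (reusing model, optimizer, train_loader, device, and F from the example above; the wait/warmup/active values are arbitrary choices), the profiler can be restricted to a few batches inside the training loop:

from torch.profiler import profile, schedule, tensorboard_trace_handler, ProfilerActivity

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    # skip the first batch, warm up on one batch, record three, then stop
    schedule=schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=tensorboard_trace_handler('./logs/mnist'),
)

prof.start()
for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    loss = F.nll_loss(model(data), target)
    loss.backward()
    optimizer.step()
    prof.step()          # advance the profiler schedule once per batch
    if batch_idx >= 5:   # a handful of batches covers the schedule above
        break
prof.stop()

With a schedule like this, the trace contains only the active batches, which keeps the log files small enough for TensorBoard to load quickly.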

Viewing the results in TensorBoard

To view the profiling information in TensorBoard, point it at the log directory that was passed to tensorboard_trace_handler:

$ tensorboard --logdir=./logs/mnist

If you use a remote server over an SSH connection, the following post shows how to view TensorBoard in your local system’s browser.
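
One common way to do this (the user name and host below are placeholders, and 6006 is TensorBoard’s default port) is to forward the port over SSH and then open http://localhost:6006 in the local browser:

$ ssh -L 6006:localhost:6006 user@remote-server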
