Analytics Vidhya
Published in

Analytics Vidhya

Using RoBERTa with fast.ai for NLP

Implementing the current state-of-the-art NLP model in fast.ai

RoBERTa vs. other models on SuperGLUE tasks. Source: https://arxiv.org/pdf/1907.11692.pdf

0. Prerequisites

pip install transformers

1. Setting Up the Tokenizer

from fastai.text import *
from fastai.metrics import *
from transformers import RobertaTokenizer
# Load the pretrained RoBERTa tokenizer and wrap it for fastai.
# FastAiRobertaTokenizer is not defined in the visible code; it must already be in scope.
# pre_rules/post_rules are passed as empty lists.
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")
fastai_tokenizer = Tokenizer(
    tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=256),
    pre_rules=[],
    post_rules=[],
)

# Write the tokenizer's vocabulary files into `path`, then read vocab.json back
# from that same directory. The file is opened as UTF-8 explicitly because the
# byte-level BPE vocabulary contains non-ASCII tokens and the platform default
# encoding is not guaranteed to handle them.
path = Path()
roberta_tok.save_vocabulary(path)
with open(path / 'vocab.json', 'r', encoding='utf-8') as f:
    roberta_vocab_dict = json.load(f)

# Build the fastai Vocab from the token strings (the keys of the JSON mapping).
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict))

2. Setting up the Databunch

class RobertaTextList(TextList):
    """TextList subclass that sets RobertaDataBunch as its ``_bunch`` class."""

    # RobertaDataBunch is not defined in the visible code; it must be in scope
    # before this class statement runs.
    _bunch = RobertaDataBunch
    _label_cls = TextList

3. Loading the Data

# Read the dataset and name the text column and the label column.
df = pd.read_csv("IMDB Dataset.csv")
feat_cols = "review"
label_cols = "sentiment"

# get_roberta_processor is not defined in the visible code; it must be in scope.
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# Build the databunch: random train/valid split with a fixed seed, labels taken
# from `label_cols` as categories.
# NOTE(review): pad_idx=0 — confirm this matches the pad token id of the
# RoBERTa vocabulary and whatever mask the model computes.
data = (
    RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor)
    .split_by_rand_pct(seed=2019)
    .label_from_df(cols=label_cols, label_cls=CategoryList)
    .databunch(bs=4, pad_first=False, pad_idx=0)
)

4. Building a Custom Roberta Model

# Instantiate the custom model, called with no arguments.
# CustomRobertatModel is not defined in the visible code; the spelling is kept
# exactly as written so it matches that definition.
roberta_model = CustomRobertatModel()

5. Train the Model

# Create the learner from the databunch and the custom model, tracking accuracy.
learn = Learner(data, roberta_model, metrics=[accuracy])
learn.model.roberta.train()  # set roberta into train mode
# Train for one cycle of a single epoch. This call had been fused into the
# comment on the previous line, so it never executed.
learn.fit_one_cycle(1, max_lr=1e-5)
Result: 0.941900 accuracy after a single epoch of training.
# Optional follow-up steps once the first epoch has been run.
# find an appropriate lr: run the learning-rate finder, then plot what the
# recorder captured so a learning rate can be chosen by eye
learn.lr_find()
learn.recorder.plot()
# unfreeze layers so that later training updates all of them
learn.unfreeze()
# train using half precision; to_fp16() returns a learner, so rebind `learn`
learn = learn.to_fp16()

6. Creating Predictions

def get_preds_as_nparray(ds_type) -> tuple:
    """Return predictions for ``ds_type`` re-sorted by sampler index.

    Reads the module-level ``learn`` and ``data`` objects rather than taking
    them as arguments.

    Args:
        ds_type: Dataset selector passed to both ``learn.get_preds`` and
            ``data.dl`` (e.g. ``DatasetType.Valid``).

    Returns:
        A 2-tuple ``(ordered_preds, pred_values)``: the prediction array
        re-sorted by sampler index, and the argmax of each of its rows.
    """
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    # Presumably the sampler yields dataset indices in the order the items were
    # fed to the model; argsort of that permutation is its inverse, so indexing
    # with it puts the rows back in dataset order.
    sampler = list(data.dl(ds_type).sampler)
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values
# For Valid
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

# For Test: rebuild the databunch with a test set attached. The same seed is
# used for the random split as before.
# test_df is not defined in the visible code — assumed to be a DataFrame that
# has the `feat_cols` column; confirm it is loaded before this point.
data = (
    RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor)
    .split_by_rand_pct(seed=2019)
    .label_from_df(cols=label_cols, label_cls=CategoryList)
    .add_test(RobertaTextList.from_df(test_df, ".", cols=feat_cols, processor=processor))
    .databunch(bs=4, pad_first=False, pad_idx=0)
)
# The learner was created from the earlier databunch, which has no test set;
# point it at the rebuilt one so learn.get_preds(DatasetType.Test) and
# data.dl(DatasetType.Test) refer to the same data.
learn.data = data
# test_preds is the (ordered_preds, pred_values) pair returned by the helper.
test_preds = get_preds_as_nparray(DatasetType.Test)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store