Building Custom Named-Entity Recognition (NER) Models — Transformers

dp · May 18, 2023

A complete walk-through in which we take the programmatically annotated dataset from a previous post and use it to fine-tune a transformer.

The complete project can be found at this repository.

Data

We will be using the same dataset (ents-iob.json) we previously built in Building Custom Named-Entity Recognition Models.

[
    {
        "tokens": ["(", "6:51", "-", "1st", ")", "(", "Shotgun", ")", "P.Mahomes", "scrambles", "right", "end", "to", "LAC", "34", "for", "2", "yards", "(", "S.Joseph", ";", "K.Van", "Noy", ")", ".", "FUMBLES", "(", "S.Joseph", ")", ",", "and", "recovers", "at", "LAC", "34", "."],
        "labels": ["O", "B-TIME", "O", "B-PERIOD", "O", "O", "B-FORMATION", "O", "B-PLAYER", "B-EVENT", "B-DIRECTION", "O", "O", "B-TEAM", "B-QUANTITY", "O", "B-QUANTITY", "O", "O", "B-PLAYER", "O", "B-PLAYER", "I-PLAYER", "O", "O", "O", "O", "B-PLAYER", "O", "O", "O", "O", "O", "B-TEAM", "B-QUANTITY", "O"]
    },
    ...
]
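
Before formatting anything, it is worth sanity-checking the file: every record should have as many labels as tokens, and the tag counts give a feel for class balance. A minimal sketch, assuming ents-iob.json lives in a 4/ directory as in the repository:

import json
from collections import Counter

with open('4/ents-iob.json', 'r', encoding='utf-8') as fp:
    records = json.load(fp)

# every record must have one label per token
assert all(len(r['tokens']) == len(r['labels']) for r in records)

# count entity groups, ignoring the B-/I- prefix and O
print(Counter(
    label.split('-', 1)[-1]
    for record in records
    for label in record['labels']
    if label != 'O'
))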

Libraries

pip install extr-ds
pip install tensorflow
pip install transformers
pip install datasets
pip install evaluate
pip install seqeval

Setup

We will be fine-tuning the bert-base-cased checkpoint.

epochs = 15
model_checkpoint = 'bert-base-cased'
model_output_checkpoint = 'transformers/nfl_pbp_token_classifier'

entity_groups = [
    'TIME',
    'PERIOD',
    'TEAM',
    'PLAYER',
    'POSITION',
    'FORMATION',
    'EVENT',
    'DIRECTION',
    'QUANTITY'
]

labels = ['O'] + \
[f'B-{label}' for label in entity_groups] + \
[f'I-{label}' for label in entity_groups]

label2id = { label:i for i, label in enumerate(labels) }
id2label = { i:label for i, label in enumerate(labels) }
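
With nine entity groups this comes out to 19 labels: O plus a B-/I- pair per group. A quick check:

print(len(labels))   # 19
print(labels[:4])    # ['O', 'B-TIME', 'B-PERIOD', 'B-TEAM']
print(label2id['B-PLAYER'], id2label[0])  # 4 O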

Format Dataset

The original dataset used nltk.tokenize.word_tokenize to tokenize the text. The transformer tokenizer splits words further into sub-tokens, so the transition is a bit awkward. The align_labels method extends our labeling to these sub-tokens as I-<entity_group>, which lets us use the aggregation_strategy='simple' option later when extracting entities.

import os
import re
import json
import random
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from extr_ds.manager.utils.filesystem import load_document

def align_labels(tokenized_inputs, label_list):
    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids(batch_index=0):
        # special tokens ([CLS], [SEP]) have no word id and are ignored by the loss
        label_id = -100
        if word_idx is not None:
            # continuation sub-tokens of the same word inherit its label as I-<entity_group>
            label = re.sub(r'^[BI]-(.+)$', r'I-\g<1>', label_list[word_idx]) \
                if word_idx == previous_word_idx \
                else label_list[word_idx]

            label_id = label2id[label]

        labels.append(label_id)
        previous_word_idx = word_idx

    return labels

def get_dataset(tokenizer, model):
    def tokenize_and_align_labels(record):
        tokenized_inputs = tokenizer(
            record['tokens'],
            truncation=True,
            is_split_into_words=True
        )

        tokenized_inputs['labels'] = align_labels(
            tokenized_inputs,
            record['labels']
        )

        return tokenized_inputs

    ents_dataset = json.loads(
        load_document(os.path.join('4', 'ents-iob.json'))
    )

    random.shuffle(ents_dataset)

    pivot = int(len(ents_dataset) * .8)
    data_collator = DataCollatorForTokenClassification(
        tokenizer,
        return_tensors='tf'
    )

    train_dataset = Dataset.from_list(ents_dataset[:pivot])
    tf_train_set = model.prepare_tf_dataset(
        train_dataset.map(
            tokenize_and_align_labels,
            batched=False
        ),
        shuffle=True,
        collate_fn=data_collator,
    )

    test_dataset = Dataset.from_list(ents_dataset[pivot:])
    tf_test_set = model.prepare_tf_dataset(
        test_dataset.map(
            tokenize_and_align_labels,
            batched=False
        ),
        shuffle=True,
        collate_fn=data_collator,
    )

    return tf_train_set, tf_test_set
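
As a quick sanity check, you can run align_labels on a single record and inspect how the sub-tokens line up: the first piece of a split word keeps its B- tag, the continuation pieces get I-, and [CLS]/[SEP] get -100. A minimal sketch, assuming the bert-base-cased tokenizer from the Model section below is already loaded:

record = json.loads(
    load_document(os.path.join('4', 'ents-iob.json'))
)[0]

tokenized_inputs = tokenizer(
    record['tokens'],
    truncation=True,
    is_split_into_words=True
)

for token, label_id in zip(tokenized_inputs.tokens(), align_labels(tokenized_inputs, record['labels'])):
    print(token, 'ignored' if label_id == -100 else id2label[label_id])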

Metrics

import numpy
import evaluate
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback

seqeval = evaluate.load('seqeval')

def compute_metrics(preds):
    predictions, actuals = preds
    predictions = numpy.argmax(predictions, axis=2)

    results = seqeval.compute(
        predictions=[
            [labels[p] for p, l in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, actuals)
        ],
        references=[
            [labels[l] for p, l in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, actuals)
        ]
    )

    return {
        key: results[f'overall_{key}']
        for key in ['precision', 'recall', 'f1', 'accuracy']
    }

# tf_test_set is created by get_dataset in the Model section below
callbacks = [
    KerasMetricCallback(
        metric_fn=compute_metrics,
        eval_dataset=tf_test_set
    ),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
]
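
To confirm the -100 masking drops special tokens from scoring, a toy call with hand-built logits is enough. A minimal sketch (one sequence of three sub-tokens, with the first and last masked out as special tokens):

toy_logits = numpy.zeros((1, 3, len(labels)))
toy_logits[0, 1, label2id['B-PLAYER']] = 1.0  # predict B-PLAYER on the real token
toy_actuals = numpy.array([[-100, label2id['B-PLAYER'], -100]])

print(compute_metrics((toy_logits, toy_actuals)))
# {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'accuracy': 1.0}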

Model

The default Adam learning rate (1e-3) is too high for fine-tuning; a much smaller value such as 2e-5 helps the model converge without destroying the pretrained weights.

import tensorflow as tf
from transformers import AutoTokenizer, \
TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained(
model_checkpoint
)

model = TFAutoModelForTokenClassification.from_pretrained(
model_checkpoint,
num_labels=len(labels),
id2label=id2label,
label2id=label2id
)

tf_train_set, tf_test_set = get_dataset(tokenizer, model)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)

model.fit(
x=tf_train_set,
validation_data=tf_test_set,
epochs=epochs,
callbacks=callbacks
)
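
A fixed 2e-5 converges fine here; if you would rather use warmup and linear decay, the create_optimizer helper in transformers builds both. A sketch under the same setup (the 10% warmup fraction is an arbitrary choice of mine):

from transformers import create_optimizer

num_train_steps = len(tf_train_set) * epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=int(0.1 * num_train_steps)  # assumption: 10% warmup
)
model.compile(optimizer=optimizer)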

Save Model

for model_to_save in [tokenizer, model]:
    model_to_save.save_pretrained(model_output_checkpoint)

Invoking Custom Model

from transformers import pipeline

classifier = pipeline(
    'ner',
    model=model_output_checkpoint,
    aggregation_strategy='simple'
)

examples = [
    '(6:51 - 1st) (Shotgun) P.Mahomes scrambles right end to LAC 34 for 2 yards (S.Joseph; K.Van Noy). FUMBLES (S.Joseph), and recovers at LAC 34.',
]

responses = classifier(examples)
print(responses)

The aggregation_strategy='simple' option combines our B-/I- labels into the entity_groups shown below. See Convert Transformer Inference Output back to IOB2 Format to convert the response back to IOB2 format.

[
{'entity_group': 'TIME', 'score': 0.9888856, 'word': '6 : 51', 'start': 1, 'end': 5},
{'entity_group': 'PERIOD', 'score': 0.9887093, 'word': '1st', 'start': 8, 'end': 11},
{'entity_group': 'FORMATION', 'score': 0.98260975, 'word': 'Shotgun', 'start': 14, 'end': 21},
{'entity_group': 'PLAYER', 'score': 0.9936474, 'word': 'P. Mahomes', 'start': 23, 'end': 32},
{'entity_group': 'EVENT', 'score': 0.69440436, 'word': 'scrambles', 'start': 33, 'end': 42},
{'entity_group': 'DIRECTION', 'score': 0.88298887, 'word': 'right', 'start': 43, 'end': 48},
{'entity_group': 'TEAM', 'score': 0.97735167, 'word': 'LAC', 'start': 56, 'end': 59},
{'entity_group': 'QUANTITY', 'score': 0.9734075, 'word': '34', 'start': 60, 'end': 62},
{'entity_group': 'QUANTITY', 'score': 0.9110169, 'word': '2', 'start': 67, 'end': 68},
{'entity_group': 'PLAYER', 'score': 0.9935433, 'word': 'S. Joseph', 'start': 76, 'end': 84},
{'entity_group': 'PLAYER', 'score': 0.9919572, 'word': 'K. Van Noy', 'start': 86, 'end': 95},
{'entity_group': 'PLAYER', 'score': 0.9934915, 'word': 'S. Joseph', 'start': 107, 'end': 115},
{'entity_group': 'TEAM', 'score': 0.97411484, 'word': 'LAC', 'start': 134, 'end': 137},
{'entity_group': 'QUANTITY', 'score': 0.9710606, 'word': '34', 'start': 138, 'end': 140}
]