Building Custom Named-Entity Recognition (NER) Models — Transformers

dp · May 18, 2023

A complete walk-through in which we take the programmatically annotated dataset from a previous post and use it to fine-tune a transformer.

The complete project can be found at this repository.

Data

We will be using the same dataset (ents-iob.json) we previously built in Building Custom Named-Entity Recognition Models.

[
    {
        "tokens": ["(", "6:51", "-", "1st", ")", "(", "Shotgun", ")", "P.Mahomes", "scrambles", "right", "end", "to", "LAC", "34", "for", "2", "yards", "(", "S.Joseph", ";", "K.Van", "Noy", ")", ".", "FUMBLES", "(", "S.Joseph", ")", ",", "and", "recovers", "at", "LAC", "34", "."],
        "labels": ["O", "B-TIME", "O", "B-PERIOD", "O", "O", "B-FORMATION", "O", "B-PLAYER", "B-EVENT", "B-DIRECTION", "O", "O", "B-TEAM", "B-QUANTITY", "O", "B-QUANTITY", "O", "O", "B-PLAYER", "O", "B-PLAYER", "I-PLAYER", "O", "O", "O", "O", "B-PLAYER", "O", "O", "O", "O", "O", "B-TEAM", "B-QUANTITY", "O"]
    },
    ...
]
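
Before formatting anything, it is worth sanity-checking the file: every record should have as many labels as tokens, and the tag counts give a feel for class balance. A minimal sketch, assuming ents-iob.json lives in a 4/ directory as in the repository:

import json
from collections import Counter

with open('4/ents-iob.json', 'r', encoding='utf-8') as fp:
    records = json.load(fp)

# every record must have one label per token
assert all(len(r['tokens']) == len(r['labels']) for r in records)

# count entity groups, ignoring the B-/I- prefix and O
print(Counter(
    label.split('-', 1)[-1]
    for record in records
    for label in record['labels']
    if label != 'O'
))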

Libraries

pip install extr-ds
pip install tensorflow
pip install transformers
pip install datasets
pip install evaluate
pip install seqeval

Setup

We will be fine-tuning the bert-base-cased checkpoint.

epochs = 15
model_checkpoint = 'bert-base-cased'
model_output_checkpoint = 'transformers/nfl_pbp_token_classifier'

entity_groups = [
    'TIME',
    'PERIOD',
    'TEAM',
    'PLAYER',
    'POSITION',
    'FORMATION',
    'EVENT',
    'DIRECTION',
    'QUANTITY'
]

labels = ['O'] + \
[f'B-{label}' for label in entity_groups] + \
[f'I-{label}' for label in entity_groups]

label2id = { label:i for i, label in enumerate(labels) }
id2label = { i:label for i, label in enumerate(labels) }
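
With nine entity groups this comes out to 19 labels: O plus a B-/I- pair per group. A quick check:

print(len(labels))   # 19
print(labels[:4])    # ['O', 'B-TIME', 'B-PERIOD', 'B-TEAM']
print(label2id['B-PLAYER'], id2label[0])  # 4 O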

Format Dataset

The original dataset used nltk.tokenize.word_tokenize to tokenize the text. The transformer tokenizer splits words further into sub-tokens, so the transition is a bit awkward. The align_labels method extends our labeling to these sub-tokens as I-<entity_group>, which lets us use the aggregation_strategy='simple' option later when extracting entities.

import os
import re
import json
import random
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from extr_ds.manager.utils.filesystem import load_document

def align_labels(tokenized_inputs, label_list):
    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids(batch_index=0):
        # special tokens ([CLS], [SEP]) have no word id and are ignored by the loss
        label_id = -100
        if word_idx is not None:
            # continuation sub-tokens of the same word inherit its label as I-<entity_group>
            label = re.sub(r'^[BI]-(.+)$', r'I-\g<1>', label_list[word_idx]) \
                if word_idx == previous_word_idx \
                else label_list[word_idx]

            label_id = label2id[label]

        labels.append(label_id)
        previous_word_idx = word_idx

    return labels

def get_dataset(tokenizer, model):
    def tokenize_and_align_labels(record):
        tokenized_inputs = tokenizer(
            record['tokens'],
            truncation=True,
            is_split_into_words=True
        )

        tokenized_inputs['labels'] = align_labels(
            tokenized_inputs,
            record['labels']
        )

        return tokenized_inputs

    ents_dataset = json.loads(
        load_document(os.path.join('4', 'ents-iob.json'))
    )

    random.shuffle(ents_dataset)

    pivot = int(len(ents_dataset) * .8)
    data_collator = DataCollatorForTokenClassification(
        tokenizer,
        return_tensors='tf'
    )

    train_dataset = Dataset.from_list(ents_dataset[:pivot])
    tf_train_set = model.prepare_tf_dataset(
        train_dataset.map(
            tokenize_and_align_labels,
            batched=False
        ),
        shuffle=True,
        collate_fn=data_collator,
    )

    test_dataset = Dataset.from_list(ents_dataset[pivot:])
    tf_test_set = model.prepare_tf_dataset(
        test_dataset.map(
            tokenize_and_align_labels,
            batched=False
        ),
        shuffle=True,
        collate_fn=data_collator,
    )

    return tf_train_set, tf_test_set
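
As a quick sanity check, you can run align_labels on a single record and inspect how the sub-tokens line up: the first piece of a split word keeps its B- tag, the continuation pieces get I-, and [CLS]/[SEP] get -100. A minimal sketch, assuming the bert-base-cased tokenizer from the Model section below is already loaded:

record = json.loads(
    load_document(os.path.join('4', 'ents-iob.json'))
)[0]

tokenized_inputs = tokenizer(
    record['tokens'],
    truncation=True,
    is_split_into_words=True
)

for token, label_id in zip(tokenized_inputs.tokens(), align_labels(tokenized_inputs, record['labels'])):
    print(token, 'ignored' if label_id == -100 else id2label[label_id])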

Metrics

import numpy
import evaluate
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback

seqeval = evaluate.load('seqeval')

def compute_metrics(preds):
    predictions, actuals = preds
    predictions = numpy.argmax(predictions, axis=2)

    results = seqeval.compute(
        predictions=[
            [labels[p] for p, l in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, actuals)
        ],
        references=[
            [labels[l] for p, l in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, actuals)
        ]
    )

    return {
        key: results[f'overall_{key}']
        for key in ['precision', 'recall', 'f1', 'accuracy']
    }

# tf_test_set is created by get_dataset in the Model section below
callbacks = [
    KerasMetricCallback(
        metric_fn=compute_metrics,
        eval_dataset=tf_test_set
    ),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
]
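
To confirm the -100 masking drops special tokens from scoring, a toy call with hand-built logits is enough. A minimal sketch (one sequence of three sub-tokens, with the first and last masked out as special tokens):

toy_logits = numpy.zeros((1, 3, len(labels)))
toy_logits[0, 1, label2id['B-PLAYER']] = 1.0  # predict B-PLAYER on the real token
toy_actuals = numpy.array([[-100, label2id['B-PLAYER'], -100]])

print(compute_metrics((toy_logits, toy_actuals)))
# {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'accuracy': 1.0}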

Model

The default Adam learning rate (1e-3) is too high for fine-tuning; a much smaller value such as 2e-5 helps the model converge without destroying the pretrained weights.

import tensorflow as tf
from transformers import AutoTokenizer, \
TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained(
model_checkpoint
)

model = TFAutoModelForTokenClassification.from_pretrained(
model_checkpoint,
num_labels=len(labels),
id2label=id2label,
label2id=label2id
)

tf_train_set, tf_test_set = get_dataset(tokenizer, model)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)

model.fit(
x=tf_train_set,
validation_data=tf_test_set,
epochs=epochs,
callbacks=callbacks
)
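
A fixed 2e-5 converges fine here; if you would rather use warmup and linear decay, the create_optimizer helper in transformers builds both. A sketch under the same setup (the 10% warmup fraction is an arbitrary choice of mine):

from transformers import create_optimizer

num_train_steps = len(tf_train_set) * epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=int(0.1 * num_train_steps)  # assumption: 10% warmup
)
model.compile(optimizer=optimizer)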

Save Model

for model_to_save in [tokenizer, model]:
    model_to_save.save_pretrained(model_output_checkpoint)

Invoking Custom Model

from transformers import pipeline

classifier = pipeline(
    'ner',
    model=model_output_checkpoint,
    aggregation_strategy='simple'
)

examples = [
    '(6:51 - 1st) (Shotgun) P.Mahomes scrambles right end to LAC 34 for 2 yards (S.Joseph; K.Van Noy). FUMBLES (S.Joseph), and recovers at LAC 34.',
]

responses = classifier(examples)
print(responses)

The aggregation_strategy='simple' option combines our B-/I- labels into the entity_groups shown below. See Convert Transformer Inference Output back to IOB2 Format to convert the response back to IOB2 format.

[
{'entity_group': 'TIME', 'score': 0.9888856, 'word': '6 : 51', 'start': 1, 'end': 5},
{'entity_group': 'PERIOD', 'score': 0.9887093, 'word': '1st', 'start': 8, 'end': 11},
{'entity_group': 'FORMATION', 'score': 0.98260975, 'word': 'Shotgun', 'start': 14, 'end': 21},
{'entity_group': 'PLAYER', 'score': 0.9936474, 'word': 'P. Mahomes', 'start': 23, 'end': 32},
{'entity_group': 'EVENT', 'score': 0.69440436, 'word': 'scrambles', 'start': 33, 'end': 42},
{'entity_group': 'DIRECTION', 'score': 0.88298887, 'word': 'right', 'start': 43, 'end': 48},
{'entity_group': 'TEAM', 'score': 0.97735167, 'word': 'LAC', 'start': 56, 'end': 59},
{'entity_group': 'QUANTITY', 'score': 0.9734075, 'word': '34', 'start': 60, 'end': 62},
{'entity_group': 'QUANTITY', 'score': 0.9110169, 'word': '2', 'start': 67, 'end': 68},
{'entity_group': 'PLAYER', 'score': 0.9935433, 'word': 'S. Joseph', 'start': 76, 'end': 84},
{'entity_group': 'PLAYER', 'score': 0.9919572, 'word': 'K. Van Noy', 'start': 86, 'end': 95},
{'entity_group': 'PLAYER', 'score': 0.9934915, 'word': 'S. Joseph', 'start': 107, 'end': 115},
{'entity_group': 'TEAM', 'score': 0.97411484, 'word': 'LAC', 'start': 134, 'end': 137},
{'entity_group': 'QUANTITY', 'score': 0.9710606, 'word': '34', 'start': 138, 'end': 140}
]