Predicting Hospital Readmission using NLP


Part 1

Part 2

Intended Audience

Model Definition

About the Dataset

Step 1. Prepare the Data


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the admissions table (one row per hospital admission).
# NOTE(review): assumes ADMISSIONS.csv is in the working directory — confirm path.
df_adm = pd.read_csv('ADMISSIONS.csv')

Explore the data


Remove NEWBORN admissions

df_adm = df_adm.loc[df_adm.ADMISSION_TYPE != 'NEWBORN']# Ensure rows with NEWBORN ADMISSION_TYPE are removed

Remove Deaths

# Before dropping death admissions, record their HADM_IDs so the same
# admissions can later be filtered out of the notes table as well.
death_mask = df_adm.DEATHTIME.notnull()
hadm_rows_death = df_adm[death_mask]
print("Number of death admissions:", len(hadm_rows_death))
# Keep the HADM_IDs of deceased patients in a plain list.
hadm_death_list = list(hadm_rows_death["HADM_ID"])
print("Length of the HADM_ID list:", len(hadm_death_list))
# Keep only admissions where the patient survived (DEATHTIME is null).
df_adm = df_adm[~death_mask]
print('Total rows in admissions dataframe:', len(df_adm))
print('Non-death admissions:', df_adm.DEATHTIME.isnull().sum())

Convert strings to dates

# Parse the three timestamp columns; unparseable values become NaT
# (errors='coerce') rather than raising.
for col in ['ADMITTIME', 'DISCHTIME', 'DEATHTIME']:
    df_adm[col] = pd.to_datetime(df_adm[col], format='%Y-%m-%d %H:%M:%S', errors='coerce')
# Sanity check: coercion should not have produced missing admit/discharge dates.
print('Number of missing admissions dates:', df_adm.ADMITTIME.isnull().sum())
print('Number of missing discharge dates:', df_adm.DISCHTIME.isnull().sum())

Get the next Unplanned admission date for each patient (if it exists)

# Sort chronologically within each patient so shift(-1) yields the next admission.
df_adm = df_adm.sort_values(['SUBJECT_ID', 'ADMITTIME'])
# drop=True discards the old index instead of adding it as a column.
df_adm = df_adm.reset_index(drop=True)
# Adapted from: https://towardsdatascience.com/introduction-to-clinical-natural-language-processing-predicting-hospital-readmission-with-1736d52bc709#5759
# Next admission date and type per subject. groupby is required so the
# shifted values never leak across subject boundaries.
df_adm['NEXT_ADMITTIME'] = df_adm.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
df_adm['NEXT_ADMISSION_TYPE'] = df_adm.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)
# Only UNPLANNED readmissions count: blank out rows whose next admission is
# elective. (BUG FIX: `rows` was previously undefined.)
rows = df_adm.NEXT_ADMISSION_TYPE == 'ELECTIVE'
df_adm.loc[rows, 'NEXT_ADMITTIME'] = pd.NaT
df_adm.loc[rows, 'NEXT_ADMISSION_TYPE'] = np.nan  # np.NaN was removed in NumPy 2.0
# It's safer to sort right before the fill in case something above changed the order.
df_adm = df_adm.sort_values(['SUBJECT_ID', 'ADMITTIME'])
# Back-fill per subject so an elective row inherits the next unplanned
# admission, if any. (fillna(method='bfill') is deprecated; .bfill() is the
# equivalent modern call.) This can take a little while.
df_adm[['NEXT_ADMITTIME', 'NEXT_ADMISSION_TYPE']] = (
    df_adm.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME', 'NEXT_ADMISSION_TYPE']].bfill()
)

Calculate days until next admission

df_adm['DAYS_TIL_NEXT_ADMIT'] = (df_adm.NEXT_ADMITTIME - df_adm.DISCHTIME).dt.total_seconds()/(24*60*60)
plt.hist(df_adm.loc[~df_adm.DAYS_TIL_NEXT_ADMIT.isnull(),'DAYS_TIL_NEXT_ADMIT'], bins =range(0,365,30))
plt.xlabel('Days between admissions')
plt.xticks(np.arange(0, 365, 30))
plt.title('Histogram of 30-day unplanned re-admissions over 365 days')


df_notes = pd.read_csv("NOTEEVENTS.csv")

I’ll spend some time exploring the data


Investigate why HADM_IDs are missing

Remove notes for death admissions

df_notes = df_notes[~df_notes['HADM_ID'].isin(hadm_death_list)]

Concatenate Notes for Each Patient

# Parse note timestamps: CHARTDATE is date-only, CHARTTIME includes a time.
df_notes.CHARTDATE = pd.to_datetime(df_notes.CHARTDATE, format = '%Y-%m-%d', errors = 'coerce')
df_notes.CHARTTIME = pd.to_datetime(df_notes.CHARTTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
# Chronological order within each subject so concatenated notes read in time order.
df_notes = df_notes.sort_values(['SUBJECT_ID','CHARTDATE', 'CHARTTIME'])
df_notes = df_notes.reset_index(drop = True)
# Build one row per subject with all of their notes joined into a single string.
df_subj_concat_notes = df_notes[['SUBJECT_ID', 'TEXT']].copy()
df_subj_concat_notes = df_subj_concat_notes.groupby('SUBJECT_ID')['TEXT'].agg(' '.join).reset_index()
# Rename the concatenated column to TEXT_CONCAT.
df_subj_concat_notes.rename(columns={"TEXT":"TEXT_CONCAT"}, inplace=True)
# BUG FIX: corrected the typo in the assertion message ("Dulpicate").
assert df_subj_concat_notes.duplicated(['SUBJECT_ID']).sum() == 0, 'Duplicate SUBJECT_IDs exist'

Merge Datasets

# BUG FIX: the merge call itself was missing from the source (only its keyword
# arguments survived). Left-join the per-subject concatenated notes onto the
# admissions table; 'left' keeps every admission even if it has no notes.
df_adm_notes = pd.merge(df_adm,
                        df_subj_concat_notes,
                        on = ['SUBJECT_ID'],
                        how = 'left')
# A left join on a deduplicated right side must not add rows.
assert len(df_adm) == len(df_adm_notes), 'Number of rows increased'

Make Output Label

# Label: 1 if this admission was followed by an unplanned readmission within
# 30 days, else 0 (NaN comparisons are False, so no-next-admission rows get 0).
df_adm_notes['OUTPUT_LABEL'] = (df_adm_notes.DAYS_TIL_NEXT_ADMIT < 30).astype('int')
print('Number of positive samples:', (df_adm_notes.OUTPUT_LABEL == 1).sum())
print('Number of negative samples:', (df_adm_notes.OUTPUT_LABEL == 0).sum())
print('Total samples:', len(df_adm_notes))
# Keep only the 3 essential columns. .copy() prevents SettingWithCopyWarning
# from the inplace drops below.
df_adm_notes_squashed = df_adm_notes[['SUBJECT_ID', 'TEXT_CONCAT', 'OUTPUT_LABEL']].copy()
# Collapse to one label per subject: positive if ANY of their admissions was
# followed by a 30-day readmission.
df_subj_labels_squashed = df_adm_notes_squashed.groupby('SUBJECT_ID')[['OUTPUT_LABEL']].sum().reset_index()
df_subj_labels_squashed.rename(columns={"OUTPUT_LABEL":"OUTPUT_LABELS_SUMMED"}, inplace=True)
df_subj_labels_squashed['OUTPUT_LABEL'] = (df_subj_labels_squashed['OUTPUT_LABELS_SUMMED'] >= 1).astype(int)
df_subj_labels_squashed.drop(columns=['OUTPUT_LABELS_SUMMED'], inplace=True)
# Collapse the notes table to one row per subject as well.
df_adm_notes_squashed.drop(columns=['OUTPUT_LABEL'], inplace=True)
df_adm_notes_squashed.drop_duplicates(subset='SUBJECT_ID', keep='first', inplace=True)
print('Length of df_adm_notes_squashed:', len(df_adm_notes_squashed))
print('Length of df_subj_labels_squashed:', len(df_subj_labels_squashed))
# BUG FIX: the second dataframe argument was missing from this merge call.
df_adm_notes_merged = pd.merge(df_subj_labels_squashed[['SUBJECT_ID','OUTPUT_LABEL']],
                               df_adm_notes_squashed,
                               on = ['SUBJECT_ID'],
                               how = 'left')
assert len(df_subj_labels_squashed) == len(df_adm_notes_merged), 'Number of rows increased'

Make Training / Validation / Test sets

# Shuffle all samples reproducibly before splitting.
df_adm_notes_merged = df_adm_notes_merged.sample(n=len(df_adm_notes_merged), random_state=42)
df_adm_notes_merged = df_adm_notes_merged.reset_index(drop=True)
# 80/10/10 split: hold out 20%, then halve it into validation and test.
# (BUG FIX: several statements below had been fused onto single lines.)
df_valid_and_test = df_adm_notes_merged.sample(frac=0.20, random_state=42)
df_test = df_valid_and_test.sample(frac=0.5, random_state=42)
df_valid = df_valid_and_test.drop(df_test.index)
df_train = df_adm_notes_merged.drop(df_valid_and_test.index)
# Every sample must land in exactly one split.
assert len(df_adm_notes_merged) == (len(df_test) + len(df_valid) + len(df_train)), "Split wasn't done mathematically correct."

# Report the positive-class prevalence of each split.
print("Training set prevalence (n = {:d}):".format(len(df_train)), "{:.2f}%".format((df_train.OUTPUT_LABEL.sum()/len(df_train))*100))
print("Validation set prevalence (n = {:d}):".format(len(df_valid)), "{:.2f}%".format((df_valid.OUTPUT_LABEL.sum()/len(df_valid))*100))
print("Test set prevalence (n = {:d}):".format(len(df_test)), "{:.2f}%".format((df_test.OUTPUT_LABEL.sum()/len(df_test))*100))
print("All samples (n = {:d})".format(len(df_adm_notes_merged)))
# Balance the training set: keep every positive and subsample an equal number
# of negatives.
pos_rows = df_train.OUTPUT_LABEL == 1
df_train_pos = df_train.loc[pos_rows]
df_train_neg = df_train.loc[~pos_rows]
df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state=42)], axis = 0)
# Shuffle so positives and negatives are interleaved.
df_train = df_train.sample(n = len(df_train), random_state = 42).reset_index(drop=True)
print("Training set prevalence (n = {:d}):".format(len(df_train)), "{:.2f}%".format((df_train.OUTPUT_LABEL.sum()/len(df_train))*100))

Step 2: Preprocessing Pipeline

Bag of Words (BOW) -> Count Vectorizer

Clean and Tokenize

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer # lemmatizes word based on its parts of speech
# Show what will be stripped during cleaning: punctuation and English stop words.
print('Punctuation:', string.punctuation)
print('NLTK English Stop Words:', '\n', stopwords.words('english'))


def convert_tag(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS character.

    treebank_tag (str): Treebank tag, e.g. 'JJ', 'VBD', 'NNS', 'RB'.

    Returns one of 'a' (adjective), 'v' (verb), 'n' (noun) or 'r' (adverb);
    defaults to 'n' when the tag prefix is not recognized.
    """
    # (BUG FIX: the function body had lost its indentation in the source.)
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('N'):
        return 'n'
    elif treebank_tag.startswith('R'):
        return 'r'
    return 'n'  # if no match, default to noun
def lemmatizer(tokens):
    """Lemmatize tokens using WordNet, guided by POS tags.

    tokens (list of str): cleaned tokens with stopwords removed.

    Returns:
        lemma_words (list of str): lemmatized words. Words not found in
        WordNet are returned unchanged (WordNet morphy behavior).
    """
    # (BUG FIX: the docstring had lost its quotes and the body its
    # indentation in the source.)
    # POS-tag before lemmatizing so e.g. verbs lemmatize correctly.
    tagged_words = pos_tag(tokens)  # list of tuples like ('recent', 'JJ')

    # Lemmatize using WordNet's built-in morphy function.
    wnl = WordNetLemmatizer()

    # Convert each Treebank tag to a WordNet POS and lemmatize.
    lemma_words = [wnl.lemmatize(word, convert_tag(tag)) for word, tag in tagged_words]

    return lemma_words
def preprocess_and_tokenize(text):
    """Clean raw note text and return lemmatized tokens.

    text (str): full original, uncleaned text.

    Returns:
        lemmatized_tokens (list of str): lowercased, punctuation/number-free,
        stopword-filtered, lemmatized words.
    """
    # (BUG FIX: restored the docstring quotes and body indentation; regex
    # patterns are now raw strings to avoid invalid-escape warnings.)
    # Make text lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Remove numbers and words that contain numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove newline chars and carriage returns
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\r', '', text)

    # Tokenize
    word_tokens = word_tokenize(text)

    # Remove stop words. Build the set once instead of re-reading the
    # stopword corpus for every token (same result, far fewer lookups).
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in word_tokens if word not in stop_words]

    # Call the lemmatizer function above to perform lemmatization
    lemmatized_tokens = lemmatizer(tokens)

    return lemmatized_tokens

Count Vectorizer

Build a vectorizer on the clinical notes

# BUG FIX: the import and the vectorizer construction were fused onto one
# line in the source, and the fit step had been lost entirely (the later
# vect.transform(...) calls require a fitted vectorizer).
from sklearn.feature_extraction.text import CountVectorizer

# Create the vectorizer; cap the vocabulary at the 3000 most frequent terms
# and use our custom cleaning/tokenizing pipeline.
vect = CountVectorizer(max_features=3000, tokenizer=preprocess_and_tokenize)
# Fit on the training notes. .astype(str) fixes "ValueError: np.nan is an
# invalid document, expected byte or unicode string" caused by NaN notes.
vect.fit(df_train.TEXT_CONCAT.values.astype(str))

Zipf’s Law

# Per-class term frequencies (for the Zipf's-law inspection).
neg_docs = df_train[df_train.OUTPUT_LABEL == 0].TEXT_CONCAT.astype(str)
pos_docs = df_train[df_train.OUTPUT_LABEL == 1].TEXT_CONCAT.astype(str)
neg_doc_matrix = vect.transform(neg_docs)
pos_doc_matrix = vect.transform(pos_docs)
# Column sums give each vocabulary term's total count in that class.
neg_tf = np.sum(neg_doc_matrix, axis=0)
pos_tf = np.sum(pos_doc_matrix, axis=0)
neg = np.squeeze(np.asarray(neg_tf))
pos = np.squeeze(np.asarray(pos_tf))
# Vectorize the full training and validation sets (could take a while).
X_train_tf = vect.transform(df_train.TEXT_CONCAT.values.astype(str))
X_valid_tf = vect.transform(df_valid.TEXT_CONCAT.values.astype(str))
y_train = df_train.OUTPUT_LABEL
y_valid = df_valid.OUTPUT_LABEL

Step 3: Build a simple predictive model

# Classifier: L2-regularized logistic regression (small C = heavy regularization).
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C = 0.0001, penalty = 'l2', random_state = 42)
clf.fit(X_train_tf, y_train)
model = clf
# Predicted probability of the positive class (readmission) for each sample.
y_train_preds = model.predict_proba(X_train_tf)[:,1]
y_valid_preds = model.predict_proba(X_valid_tf)[:,1]
# Eyeball the first 10 training labels next to their predicted probabilities.
df_training_prob = pd.DataFrame([y_train[:10].values, y_train_preds[:10]]).transpose()
df_training_prob.columns = ['Actual', 'Probability']
df_training_prob = df_training_prob.astype({"Actual": int})

Visualize top words for positive and negative classes.

def get_most_important_features(vectorizer, model, n=5):
    """Return the n highest- and lowest-weighted vocabulary words per class.

    vectorizer: fitted vectorizer exposing vocabulary_ (word -> column index).
    model: fitted linear model exposing coef_ of shape (n_classes, n_features).
    n (int): number of words to keep at each extreme.

    Returns:
        dict: class index -> {'tops': [(coef, word), ...] in ascending coef
        order (the n most positive weights), 'bottom': [(coef, word), ...]
        (the n most negative weights)}.
    """
    # Invert the vocabulary so column indices map back to words.
    index_to_word = {v: k for k, v in vectorizer.vocabulary_.items()}

    classes = {}
    for class_index in range(model.coef_.shape[0]):
        word_importances = [(coef, index_to_word[i])
                            for i, coef in enumerate(model.coef_[class_index])]
        sorted_coeff = sorted(word_importances, key=lambda x: x[0], reverse=True)
        tops = sorted(sorted_coeff[:n], key=lambda x: x[0])
        bottom = sorted_coeff[-n:]
        # BUG FIX: this dict literal was truncated in the source; the keys
        # match the 'tops'/'bottom' lookups used by the plotting code.
        classes[class_index] = {'tops': tops, 'bottom': bottom}
    return classes
def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):
    """Draw horizontal bar charts of the most negative and most positive words.

    top_scores, top_words: coefficients and words at the positive extreme.
    bottom_scores, bottom_words: coefficients and words at the negative extreme.
    name (str): suptitle for the positive panel.
    """
    # (BUG FIX: the body had lost its indentation in the source.)
    y_pos = np.arange(len(top_words))

    # Pair words with scores and re-sort so bars appear in score order.
    top_pairs = [(a, b) for a, b in zip(top_words, top_scores)]
    top_pairs = sorted(top_pairs, key=lambda x: x[1])

    bottom_pairs = [(a, b) for a, b in zip(bottom_words, bottom_scores)]
    bottom_pairs = sorted(bottom_pairs, key=lambda x: x[1], reverse=True)

    top_words = [a[0] for a in top_pairs]
    top_scores = [a[1] for a in top_pairs]

    bottom_words = [a[0] for a in bottom_pairs]
    bottom_scores = [a[1] for a in bottom_pairs]

    fig = plt.figure(figsize=(10, 15))
    # BUG FIX: without separate subplots the second barh overdraws the first.
    # Left panel: negative-class words; right panel: positive-class words.
    plt.subplot(121)
    plt.barh(y_pos, bottom_scores, align='center', alpha=0.5)
    plt.title('Negative', fontsize=20)
    plt.yticks(y_pos, bottom_words, fontsize=14)
    plt.suptitle('Key words', fontsize=16)
    plt.xlabel('Importance', fontsize=20)

    plt.subplot(122)
    plt.barh(y_pos, top_scores, align='center', alpha=0.5)
    plt.title('Positive', fontsize=20)
    plt.yticks(y_pos, top_words, fontsize=14)
    plt.suptitle(name, fontsize=16)
    plt.xlabel('Importance', fontsize=20)

# BUG FIX: two statements were fused onto one line in the source.
importance = get_most_important_features(vect, clf, 50)
# For a binary LogisticRegression, coef_ has a single row, so class index 0
# holds all the weights.
top_scores = [a[0] for a in importance[0]['tops']]
top_words = [a[1] for a in importance[0]['tops']]
bottom_scores = [a[0] for a in importance[0]['bottom']]
bottom_words = [a[1] for a in importance[0]['bottom']]
plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words")

Step 4: Calculate Performance Metrics


Evaluating Classifier Performance


Plot the Learning Curve

Hyperparameter Tuning

Grid Search

Random Search

What comes after tuning?

Step 6: Run Model on Test set

# Vectorize the held-out test set (could take some hours on the full notes).
X_test_tf = vect.transform(df_test.TEXT_CONCAT.values.astype(str))
# Get the output labels as a separate variable.
y_test = df_test.OUTPUT_LABEL
# Probability of readmission for each test sample from the fitted model.
y_test_preds = model.predict_proba(X_test_tf)[:,1]
# NOTE(review): thresh, calc_recall/calc_precision/calc_prevalence,
# roc_auc_score and the train/valid metrics printed below are defined
# elsewhere — confirm they are in scope before running this section.
test_recall = calc_recall(y_test, y_test_preds, thresh)
test_precision = calc_precision(y_test, y_test_preds, thresh)
test_prevalence = calc_prevalence(y_test)
auc_test = roc_auc_score(y_test, y_test_preds)
# Print performance metrics for all three splits.
print('Train prevalence(n = %d): %.3f'%(len(y_train), train_prevalence))
print('Valid prevalence(n = %d): %.3f'%(len(y_valid), valid_prevalence))
print('Test prevalence(n = %d): %.3f'%(len(y_test), test_prevalence))
print('Train AUC:%.3f'%auc_train)
print('Valid AUC:%.3f'%auc_valid)
# BUG FIX: this line printed the test AUC under a 'Valid AUC' label.
print('Test AUC:%.3f'%auc_test)
print('Train recall:%.3f'%train_recall)
print('Valid recall:%.3f'%valid_recall)
print('Test recall:%.3f'%test_recall)
print('Train precision:%.3f'%train_precision)
print('Valid precision:%.3f'%valid_precision)
print('Test precision:%.3f'%test_precision)
# Side-by-side table of the metrics — easier to eyeball for comparison.
df_test_perf_metrics = pd.DataFrame([
    [train_prevalence, valid_prevalence, test_prevalence],
    [train_recall, valid_recall, test_recall],
    [train_precision, valid_precision, test_precision],
    [auc_train, auc_valid, auc_test]], columns=['Training', 'Validation', 'Test'])
# BUG FIX: this rename call was truncated mid-statement in the source.
df_test_perf_metrics.rename(index={0:'Prevalence', 1:'Recall', 2:'Precision', 3:'AUC'},
                            inplace=True)

Conclusion — Benchmark the Results

Next Steps → BERT Transformer, Deep Learning




As CTO and founder of GitLinks I led our technology acquisition (acquired by multibillion-dollar enterprise software company, Infor).

Love podcasts or audiobooks? Learn on the go with our new app.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Nwamaka Imasogie

Nwamaka Imasogie

As CTO and founder of GitLinks I led our technology acquisition (acquired by multibillion-dollar enterprise software company, Infor).