Customer Churn: Understanding Customer Attrition

Sharath S Hebbar
4 min readDec 3, 2023

--

GitHub LinkedIn Medium Portfolio Substack

Customer churn (or customer attrition) refers to the loss of customers or subscribers for any reason at all. Businesses measure and track churn as a percentage of lost customers compared to the total number of customers over a given period. This metric is usually tracked monthly and reported at the end of the month.

Customer Churn

Let us take a deep dive into customer churn using a real-world dataset.

For this experiment, we will be using the Telco Customer Churn Dataset: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

Importing Libraries

import os
from pathlib import Path, PurePath

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
classification_report,
confusion_matrix
)

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

Loading Dataset

# Read the Telco Customer Churn CSV (path is relative to the working directory)
# into the DataFrame that the later cells plot from.
df = pd.read_csv(
    r"datas/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

# Notebook-style inspection: (rows, columns) of the loaded frame.
df.shape

Data Visualization

# Grid of donut charts: one go.Pie trace per (row, column) cell.
# n_rows, n_cols, titles, specs, labels and values are not defined in this
# snippet; they are expected to come from an earlier cell, with labels[k] and
# values[k] holding the categories and counts for chart k.
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles, specs=specs)

for i in range(1, n_rows + 1):  # Rows (row/col passed to add_trace start at 1)
    for j in range(1, n_cols + 1):  # Columns
        # Row-major position of this cell in the flat labels/values sequences.
        # Multiply by n_cols (not a hard-coded 2) so grids of any width line up.
        idx = (i - 1) * n_cols + j - 1
        if idx >= len(labels):
            # Fewer charts than grid cells: leave the remaining cells empty.
            break
        label = labels[idx]
        value = values[idx]
        fig.add_trace(
            go.Pie(labels=label, values=value, hole=0.3, name=f'Pie {idx + 1}'),
            row=i,
            col=j
        )

fig.update_layout(
    title=dict(text="<b>Distribution Charts</b>", x=0.5, xanchor='center'),
    showlegend=True,
    height=3200,
    width=1600
)

# Applies to every trace, so the hole size set on each go.Pie above is
# replaced by .2 here.
fig.update_traces(hole=.2, hoverinfo="label+percent+name", textfont_size=16)

fig.show()
Pie Chart Distribution of all the features.

Churn Pie Distribution

# Count rows per Churn value once; the counts are the slice sizes and their
# index supplies the slice labels.
values = df['Churn'].value_counts()
labels = list(values.index)

title = "<b>Churn Distribution</b>"
fig = go.Figure(
    data=[
        go.Pie(labels=labels, values=values, hole=.3),
    ]
)

fig.update_layout(
    showlegend=True,
    title={"text": title, "x": 0.5, "xanchor": 'center'},
)
# update_traces resets the hole size to .2 for the single pie trace.
fig.update_traces(hole=.2, hoverinfo="all", textfont_size=16)
fig.show()
Customer Churn Pie Chart

Bar Plot

def plot_hist(column):
    """Show a histogram of the Churn column with bars colored by `column`.

    Reads the module-level DataFrame ``df`` and displays the figure
    immediately; nothing is returned.

    Args:
        column: Name of the ``df`` column used for the color grouping; it is
            also interpolated into the chart title.
    """
    chart_title = f"<b>{column} distribution w.r.t. Churn</b>"
    fig = px.histogram(df, x="Churn", color=column, title=chart_title)
    fig.update_layout(width=700, height=500, bargap=0.1)
    fig.update_traces(hoverinfo="all", textfont_size=16)
    fig.show()


# One chart per entry of `col` (defined in an earlier cell; presumably the list
# of feature column names -- confirm).
for column_name in col:
    plot_hist(column_name)
Payment Method Bar Chart

Distribution of total charges by churn

# Overlaid density curves of TotalCharges: rows with Churn == 'No' first, then
# rows with Churn == 'Yes', drawn on the same axes.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
ax = sns.kdeplot(
    df.TotalCharges[df["Churn"] == 'No'],
    color="Gold",
    fill=True,
)
ax = sns.kdeplot(
    df.TotalCharges[df["Churn"] == 'Yes'],
    ax=ax,
    color="Green",
    fill=True,
)
# Legend entries follow the plotting order above ('No' curve, then 'Yes' curve).
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn', weight='bold')
plt.show()
Total Charges w.r.t. Churn.

Distribution of monthly charges by churn

# Overlaid density curves of MonthlyCharges: rows with Churn == 'No' first,
# then rows with Churn == 'Yes', drawn on the same axes.
sns.set_context("paper", font_scale=1.1)
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
ax = sns.kdeplot(
    df.MonthlyCharges[df["Churn"] == 'No'],
    color="Red",
    fill=True,
)
ax = sns.kdeplot(
    df.MonthlyCharges[df["Churn"] == 'Yes'],
    ax=ax,
    color="Blue",
    fill=True,
)
# Legend entries follow the plotting order above ('No' curve, then 'Yes' curve).
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn', weight='bold')
plt.show()
Monthly Charges w.r.t Churn

Tenure vs Churn

# Box plot of tenure, one box per Churn value.
fig = px.box(df, y='tenure', x='Churn')

# Axis captions for the single plot at grid position (row 1, col 1).
fig.update_xaxes(title_text='Churn', row=1, col=1)
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)

fig.update_layout(
    title='<b>Tenure vs Churn</b>',
    title_font=dict(size=25, family='Courier'),
    autosize=True,
    width=750,
    height=600,
)

fig.show()
Customers with shorter tenure (i.e., newer customers) are the ones churning.

Distplot of Tenure, Monthly Charges, Total Charges

# Histogram + KDE of the three numeric features on a 2x2 grid (the fourth cell
# stays empty).
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement recommended by seaborn, and kde_kws=dict(cut=3) keeps the KDE
# tails extending past the data like distplot did.
plt.figure(figsize=(12, 8))
for position, (feature, color) in enumerate(
    [('tenure', 'r'), ('MonthlyCharges', 'b'), ('TotalCharges', 'y')],
    start=1,
):
    plt.subplot(2, 2, position)
    sns.histplot(
        df[feature],
        kde=True,
        stat="density",
        kde_kws=dict(cut=3),
        color=color,
    )
plt.show()
Distplot of Tenure, MonthlyCharges, TotalCharges.
def plot_w_r_t_churn(col, title):
    """Draw a stacked bar chart from `col` and print a percentage on every bar segment.

    Args:
        col: Object with a pandas-style ``.plot`` method; presumably a
            DataFrame of percentages per category and Churn value -- confirm
            against the callers.
        title: Text inserted into the chart title ("Churn by <title> Level").
    """
    palette = ['#4D3425', '#E4512B']
    ax = col.plot(
        kind='bar',
        stacked=True,
        width=0.2,
        rot=0,
        figsize=(8, 6),
        color=palette,
    )

    # Show y tick labels as percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_ylabel('% Customers')
    ax.set_title(f'Churn by {title} Level', size=14)
    ax.legend(loc='center', prop={'size': 14}, title='Churn')

    # Data labels: one per bar segment, placed inside the segment and showing
    # the segment height rounded to a whole percent.
    for segment in ax.patches:
        seg_width = segment.get_width()
        seg_height = segment.get_height()
        label_xy = (
            segment.get_x() + .25 * seg_width,
            segment.get_y() + .4 * seg_height,
        )
        ax.annotate(f'{seg_height:.0f}%', label_xy, color='white', size=8)
Churn w.r.t Gender

Correlation Matrix

# Annotated heatmap of the pairwise column correlations of df1.
# df1 is not defined in this snippet; presumably the numerically encoded copy
# of the dataset from an earlier cell -- confirm.
plt.figure(figsize=(25, 15))
sns.heatmap(data=df1.corr(), cmap="coolwarm", annot=True, linewidths=2)
plt.tight_layout()
plt.show()
Correlation Matrix
def barplot(col_name, y_label, x_label, title):
    """Plot the share of rows per value of ``df[col_name]`` as a percentage bar chart.

    Reads the module-level DataFrame ``df``. Each bar is annotated with its
    own height, which is already the percentage of rows for that value.

    Args:
        col_name: Name of the ``df`` column whose value frequencies are plotted.
        y_label: Label for the y axis.
        x_label: Label for the x axis.
        title: Chart title.
    """
    colors = ['#4D3425', '#E4512B']
    # value_counts() * 100 / len(df) -> percentage of rows per distinct value.
    shares = df[col_name].value_counts() * 100.0 / len(df)
    ax = shares.plot(kind='bar', stacked=True, rot=0, color=colors)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

    for bar in ax.patches:
        height = bar.get_height()
        # The bar height is the percentage itself. It must not be divided by
        # the summed bar widths: that only gave the right label when the
        # widths happened to add up to 1.0.
        ax.text(
            bar.get_x() + .15,
            height - 3.5,
            str(round(height, 1)) + '%',
            fontsize=12,
            color='white',
            weight='bold'
        )
Bar Plots of features

Classification Model

For this project, I experimented with many classification algorithms, but CatBoost gave the best results.

CatBoost Algorithm

# Train a CatBoost classifier with its default settings on the training split.
# x_train / y_train / x_test / y_test come from an earlier cell (not shown);
# the target is the Churn column of the y frames.
cb = CatBoostClassifier()
cb.fit(X=x_train, y=y_train.Churn)

# Predict on the held-out features and tabulate predictions against the truth.
cb_y_pred = cb.predict(x_test)
cm = confusion_matrix(y_true=y_test.Churn, y_pred=cb_y_pred)

# CM is a helper defined elsewhere in the notebook; presumably it renders the
# confusion matrix and prints the metrics shown below -- confirm.
CM(cm)
Confusion Matrix
Accuracy:  0.8184397163120567
Precision: 0.7218045112781954
Recall: 0.5133689839572193
F_Score: 0.6000000000000001
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.93      0.88       518
           1       0.72      0.51      0.60       187

    accuracy                           0.82       705
   macro avg       0.78      0.72      0.74       705
weighted avg       0.81      0.82      0.81       705
# Pair each training column with the importance reported by the fitted model,
# then chart the ten largest values as bars.
cb_imp = cb.feature_importances_
cb_weights = pd.Series(data=cb_imp, index=x_train.columns.values)
cb_weights.sort_values(ascending=False).head(10).plot(kind='bar')
Feature Importance

--

--

Sharath S Hebbar

Data Science | Machine learning | Artificial Intelligence | Cloud | Internet of Things | Statistics