Customer Churn: Understanding Customer Attrition

4 min read · Dec 3, 2023

GitHub LinkedIn Medium Portfolio Substack

Customer churn (or customer attrition) refers to the loss of customers or subscribers for any reason at all. Businesses measure and track churn as a percentage of lost customers compared to the total number of customers over a given period. This metric is usually tracked monthly and reported at the end of the month.

Let us take a deep dive into customer churn with a real-world dataset.

For this experiment, we will be using the Telco Customer Churn Dataset: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

Importing Libraries

import os
from pathlib import Path, PurePath

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

Loading Dataset

# Read the Telco customer-churn CSV; the path is relative to the working directory.
csv_path = r"datas/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
# Notebook-style trailing expression: evaluates to the frame's (rows, columns) tuple.
df.shape

Data Visualization

# Build a grid of pie charts, one chart per entry of `labels` / `values`.
# NOTE(review): n_rows, n_cols, titles, specs, labels and values are defined
# outside this snippet; labels[k] / values[k] are presumably the category names
# and counts of the k-th chart -- confirm against the cell that builds them.
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles, specs=specs)

for i in range(1, n_rows + 1):  # Rows (grid positions passed to add_trace start at 1)
    for j in range(1, n_cols + 1):  # Columns
        # Row-major flat index of cell (i, j). Uses n_cols instead of a
        # hard-coded 2 so the mapping stays correct for any grid width.
        idx = (i - 1) * n_cols + j - 1
        if idx >= len(labels):
            # Trailing cells of a partially filled grid have no data: leave them empty.
            continue
        label = labels[idx]
        value = values[idx]
        fig.add_trace(
            go.Pie(labels=label, values=value, hole=0.3, name=f'Pie {idx + 1}'),
            row=i,
            col=j
        )

fig.update_layout(
    title=dict(text="<b>Distribution Charts</b>", x=0.5, xanchor='center'),
    showlegend=True,
    height=3200,
    width=1600
)

# Applied to every trace added above, so this hole size replaces the 0.3 set per pie.
fig.update_traces(hole=.2, hoverinfo="label+percent+name", textfont_size=16)

fig.show()

Pie Chart Distribution of all the features.

Churn Pie Distribution

# Count the rows per Churn value once; the index supplies the slice labels
# and the counts supply the slice sizes.
churn_counts = df['Churn'].value_counts()
labels = list(churn_counts.index)
values = churn_counts

title = "<b>Churn Distribution</b>"
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])

fig.update_layout(
    title=dict(text=title, x=0.5, xanchor='center'),
    showlegend=True,
)
# update_traces runs after construction, so this hole size replaces the .3 above.
fig.update_traces(hole=.2, hoverinfo="all", textfont_size=16)
fig.show()

Bar Plot

def plot_hist(column):
    """Show a histogram of the Churn column, colour-split by `column`.

    Reads the module-level DataFrame `df`, displays the figure, and returns
    None.

    Args:
        column: Name passed to plotly as the `color` dimension and inserted
            into the chart title.
    """
    chart_title = f"<b>{column} distribution w.r.t. Churn</b>"
    chart = px.histogram(df, x="Churn", color=column, title=chart_title)
    chart.update_layout(width=700, height=500, bargap=0.1)
    chart.update_traces(hoverinfo="all", textfont_size=16)
    chart.show()

# Draw one Churn histogram per entry of `col`.
# NOTE(review): `col` is defined outside this snippet -- presumably a list of
# column names of `df`; confirm against the cell that builds it.
for i in col:
    plot_hist(i)

Distribution of total charges by churn

# Overlay the kernel-density estimates of TotalCharges for the two Churn groups.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'No')],
                 color="Gold", fill=True)
# Second curve is drawn on the same Axes so both groups share one plot.
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'Yes')],
                 ax=ax, color="Green", fill=True)
# Legend entries follow plotting order: 'No' first, then 'Yes'.
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn', weight='bold')
plt.show()

Distribution of monthly charges by churn

# Overlay the kernel-density estimates of MonthlyCharges for the two Churn groups.
sns.set_context("paper", font_scale=1.1)
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 'No')],
                 color="Red", fill=True)
# Second curve is drawn on the same Axes so both groups share one plot.
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 'Yes')],
                 ax=ax, color="Blue", fill=True)
# Legend entries follow plotting order: 'No' first, then 'Yes'.
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn', weight='bold')
plt.show()

Tenure vs Churn

# Box plot of customer tenure, one box per Churn value.
fig = px.box(df, x='Churn', y='tenure')

# Axis titles for the single subplot at grid position (1, 1).
fig.update_xaxes(title_text='Churn', row=1, col=1)
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)

# Keyword order is kept as title_font before title: plotly applies the
# updates in the order given.
layout_options = dict(
    autosize=True,
    width=750,
    height=600,
    title_font=dict(size=25, family='Courier'),
    title='<b>Tenure vs Churn</b>',
)
fig.update_layout(**layout_options)

fig.show()

Customers with shorter tenure (i.e., newer customers) are the ones churning away.

Distplot of Tenure, Monthly Charges, Total Charges

# Distribution plots of three columns of df, placed in the first three cells
# of a 2x2 subplot grid (the fourth cell stays empty).
plt.figure(figsize=(12, 8))
panels = [('tenure', 'r'), ('MonthlyCharges', 'b'), ('TotalCharges', 'y')]
for position, (feature, plot_color) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    sns.distplot(df[feature], color=plot_color)
plt.show()

def plot_w_r_t_churn(col, title):
    """Draw a stacked bar chart of churn percentages and label every segment.

    Args:
        col: Object whose ``.plot(kind='bar', ...)`` returns a matplotlib Axes.
            Presumably a DataFrame of per-category churn percentages (0-100);
            confirm against the callers.
        title: Feature name inserted into the chart title
            (``'Churn by <title> Level'``).

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    colors = ['#4D3425', '#E4512B']
    ax = col.plot(
        kind='bar',
        width=0.2,
        stacked=True,
        rot=0,
        figsize=(8, 6),
        color=colors)

    # Tick labels are rendered with a trailing '%'; values are used as-is.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())

    ax.legend(loc='center', prop={'size': 14}, title='Churn')
    ax.set_ylabel('% Customers')
    ax.set_title(f'Churn by {title} Level', size=14)

    # Write each segment's height inside the segment: a quarter of the bar
    # width from its left edge and 40% of the way up from its bottom edge.
    for p in ax.patches:
        width, height = p.get_width(), p.get_height()
        ax.annotate(f'{height:.0f}%',
                    (p.get_x() + .25 * width, p.get_y() + .4 * height),
                    color='white', size=8)

Correlation Matrix

# Annotated heatmap of the pairwise correlations returned by df1.corr().
# NOTE(review): df1 is defined outside this snippet -- presumably an
# all-numeric (encoded) copy of df; confirm.
plt.figure(figsize=(25, 15))
correlations = df1.corr()
heatmap_options = dict(annot=True, cmap="coolwarm", linewidths=2)
sns.heatmap(correlations, **heatmap_options)

plt.tight_layout()
plt.show()

def barplot(col_name, y_label, x_label, title):
    """Plot the percentage share of each value of ``df[col_name]`` as bars.

    Reads the module-level DataFrame ``df``. Each bar's height is the value's
    share of all rows in percent, and the same number is written inside the
    bar.

    Args:
        col_name: Column of ``df`` whose value counts are plotted.
        y_label: Label for the y axis.
        x_label: Label for the x axis.
        title: Chart title.

    Returns:
        None. The chart is drawn on the current matplotlib figure.
    """
    colors = ['#4D3425', '#E4512B']
    shares = df[col_name].value_counts() * 100.0 / len(df)
    ax = shares.plot(kind='bar', stacked=True, rot=0, color=colors)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

    for bar in ax.patches:
        # Bar heights are already percentages of len(df), so they are printed
        # directly. The previous code divided by the sum of the bar *widths*,
        # which only equals 1.0 (and so gives the right label) for two bars.
        share = bar.get_height()
        ax.text(
            bar.get_x() + .15,
            share - 3.5,
            str(round(share, 1)) + '%',
            fontsize=12,
            color='white',
            weight='bold'
        )

Classification Model

For this project, I experimented with many classification algorithms, but CatBoost gave the best results.

CatBoost Algorithm

# Fit a CatBoost classifier with its default settings on the training split,
# predict the test split, and pass the confusion matrix to CM.
# NOTE(review): x_train, y_train, x_test, y_test and CM are defined outside
# this snippet; CM presumably prints the metrics shown below -- confirm.
cb = CatBoostClassifier()
train_target = y_train.Churn
cb.fit(x_train, train_target)

cb_y_pred = cb.predict(x_test)
test_target = y_test.Churn
cm = confusion_matrix(test_target, cb_y_pred)
CM(cm)

Accuracy:  0.8184397163120567
Precision:  0.7218045112781954
Recall:  0.5133689839572193
F_Score:  0.6000000000000001
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.93      0.88       518
           1       0.72      0.51      0.60       187

    accuracy                           0.82       705
   macro avg       0.78      0.72      0.74       705
weighted avg       0.81      0.82      0.81       705

# Pair each training column with its CatBoost importance score, then chart
# the ten highest-scoring features as bars.
cb_imp = cb.feature_importances_
cb_weights = pd.Series(cb_imp, index=x_train.columns.values)
top_ten = cb_weights.sort_values(ascending=False).head(10)
top_ten.plot(kind='bar')