FreaxRuby
Jan 27, 2024

Data Analysis Practical Project Based on Lending Club (Part 1)

This practical project is based on the dataset from Lending Club

Dataset URL: https://github.com/H-Freax/lendingclub_analyse

This project is conducted in a Colab environment

Introduction

This data analysis practical project is divided into two parts. The first part introduces a LightGBM-based Baseline and three methods of adding derived variables, identifying four sets of derived variables that improve the model’s performance. The second part focuses on data analysis with machine learning and deep learning methods, practicing the ensembling of machine learning models and the fusion of deep learning networks with machine learning methods.

Environment Preparation

The project uses LightGBM as the baseline. First, import the necessary packages.

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

Loading Data

seed = 42 # for the same data division
kf = KFold(n_splits=5, random_state=seed,shuffle=True)
df_train = pd.read_csv('train_final.csv')
df_test = pd.read_csv('test_final.csv')

Basic Data Inspection/Analysis

Inspect basic information of df_train.

df_train.describe()
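
Beyond the summary statistics, it is worth checking column types and missing values before modeling. A minimal sketch using standard pandas calls on the frames loaded above:

# Column dtypes, non-null counts, and memory usage
df_train.info()

# Missing-value count per column, restricted to columns that have any
missing = df_train.isnull().sum()
print(missing[missing > 0])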

To set aside the one-hot encoded columns during the initial data analysis, first list all column names with the following attribute:

df_train.columns.values

Exclude the one-hot encoded variables for visualization and observe patterns:

import matplotlib.pyplot as plt

onehotlabels = [...]  # the one-hot encoded column names (elided in the original)
showdf_train = df_train.drop(columns=onehotlabels)
showdf_train.hist(bins=50, figsize=(20, 15))
plt.show()

Since ‘continuous_fico_range’ and ‘continuous_last_fico_range’ each have an upper and a lower bound, and the two bounds are highly correlated, we drop the ‘high’ columns for the further visualization analysis.

from pandas.plotting import scatter_matrix

scatter_matrix(showdf_train.drop(columns=['continuous_fico_range_high', 'continuous_last_fico_range_high']), figsize=(40, 35))
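
The correlation claim can also be checked numerically. A quick sketch (the ‘_low’ column names here are assumptions based on the naming pattern above):

# Pairwise correlation of the FICO-range columns; values near 1.0
# confirm the low/high bounds are nearly redundant
fico_cols = ['continuous_fico_range_low', 'continuous_fico_range_high',
             'continuous_last_fico_range_low', 'continuous_last_fico_range_high']
print(df_train[fico_cols].corr())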

Baseline

Data Preprocessing

X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

# split data for five fold
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

X_train.shape, Y_train.shape

Algorithm

def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

Train

param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round': 1000}
param_fine_tuning = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
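
Both parameter sets train for a fixed number of rounds. If you would rather stop once the validation loss plateaus, LightGBM’s early-stopping callback (available in recent LightGBM versions) can be wired into the same loop; a sketch of a variant of get_model (the patience of 50 rounds is an arbitrary choice, not from the original experiments):

def get_model_early_stop(param, patience=50):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        # Stop once the validation metric has not improved for `patience` rounds;
        # this sets bst.best_iteration, which test_model already passes to predict()
        bst = lgb.train(param, train_data, valid_sets=[validation_data],
                        callbacks=[lgb.early_stopping(stopping_rounds=patience)])
        model_list.append(bst)
    return model_list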

Test

def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    # Average the five folds' probabilities, then threshold at 0.5
    ypred_mean = (five_fold_pred.mean(axis=0) > 0.5).astype(int)
    return accuracy_score(Y_test, ypred_mean)

base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
print('base: {}, fine tuning: {}'.format(base_score, fine_tuning_score))

Adding Derived Variables

CatBoostEncoder

Install and import the required package.

!pip install category_encoders
import category_encoders as ce  # package providing CatBoostEncoder

# Create the encoder
target_enc = ce.CatBoostEncoder(cols='continuous_open_acc')
target_enc.fit(df_train['continuous_open_acc'], df_train['loan_status'])

# Transform the feature, rename the new column with a _cb suffix, and join it to the dataframe
train_CBE = df_train.join(target_enc.transform(df_train['continuous_open_acc']).add_suffix('_cb'))
test_CBE = df_test.join(target_enc.transform(df_test['continuous_open_acc']).add_suffix('_cb'))
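
CatBoost encoding replaces each category value with an ordered target statistic: a row is encoded using only the target values of earlier rows in the same category (plus a prior), which limits target leakage compared with plain mean encoding. A toy illustration on made-up values, unrelated to the Lending Club columns:

# Each row's encoding drifts toward the running target mean of its category
toy = pd.DataFrame({'cat': ['a', 'a', 'b', 'a', 'b'],
                    'y':   [1,   0,   1,   1,   0]})
toy_enc = ce.CatBoostEncoder(cols=['cat'])
print(toy_enc.fit_transform(toy['cat'], toy['y']))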

Data Preprocessing

X_train = train_CBE.drop(columns=['loan_status']).values
Y_train = train_CBE['loan_status'].values.astype(int)
X_test = test_CBE.drop(columns=['loan_status']).values
Y_test = test_CBE['loan_status'].values.astype(int)

# split data for five fold
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

Algorithm

def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

Train

param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round': 1000}
param_fine_tuning = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
param_fine_tuningfinal = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1200,
                          'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)

Test

def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=0) > 0.5).astype(int)
    return accuracy_score(Y_test, ypred_mean)

base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)
print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))

base: 0.91568, fine tuning: 0.91774, fine tuning final: 0.91796

Discretization

Based on Clustering for ‘continuous_open_acc’

df_train.groupby('continuous_open_acc')['continuous_open_acc'].unique()

# KMeans ships with scikit-learn; no separate install is needed
from sklearn.cluster import KMeans

ddtrain = df_train['continuous_open_acc']
ddtest = df_test['continuous_open_acc']

data_reshape1 = ddtrain.values.reshape((ddtrain.shape[0], 1))
model_kmeans = KMeans(n_clusters=5, random_state=0)
kmeans_result = model_kmeans.fit_predict(data_reshape1)
traina = kmeans_result

data_reshape2 = ddtest.values.reshape((ddtest.shape[0], 1))
model_kmeans = KMeans(n_clusters=5, random_state=0)
kmeans_result = model_kmeans.fit_predict(data_reshape2)
testa = kmeans_result

train_KM = df_train.copy()
test_KM = df_test.copy()

train_KM['continuous_open_acc_km'] = traina
test_KM['continuous_open_acc_km'] = testa

Data Preprocessing

X_train = train_KM.drop(columns=['loan_status']).values
Y_train = train_KM['loan_status'].values.astype(int)
X_test = test_KM.drop(columns=['loan_status']).values
Y_test = test_KM['loan_status'].values.astype(int)

# Split data for five-fold
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

Algorithm

def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

Train

param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round': 1000}

param_fine_tuning = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
param_fine_tuningfinal = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 800,
                          'learning_rate': 6e-3, 'feature_fraction': 0.8, 'bagging_fraction': 0.6, 'boosting': 'goss',
                          'tree_learner': 'feature', 'max_depth': 20, 'min_sum_hessian_in_leaf': 100}
# base param train
param_base_model = get_model(param_base)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)

Test

def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=0) > 0.5).astype(int)
    return accuracy_score(Y_test, ypred_mean)

base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)

print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))

base: 0.91598, fine tuning: 0.91776, fine tuning final: 0.91874

Using Exponential Interval Division for ‘continuous_loan_amnt’

train_ZQ = df_train.copy()
test_ZQ = df_test.copy()

trainbins = np.floor(np.log10(train_ZQ['continuous_loan_amnt']))  # take the base-10 logarithm, then floor
testbins = np.floor(np.log10(test_ZQ['continuous_loan_amnt']))

train_ZQ['continuous_loan_amnt_km'] = trainbins
test_ZQ['continuous_loan_amnt_km'] = testbins

Data Preprocessing

X_train = train_ZQ.drop(columns=['loan_status']).values
Y_train = train_ZQ['loan_status'].values.astype(int)
X_test = test_ZQ.drop(columns=['loan_status']).values
Y_test = test_ZQ['loan_status'].values.astype(int)


# Split data for five-fold
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

Algorithm

def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

Train

param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round': 1000}

param_fine_tuning = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
param_fine_tuningfinal = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 900,
                          'learning_rate': 7e-3, 'feature_fraction': 0.8, 'bagging_fraction': 0.6, 'max_depth': 20, 'min_sum_hessian_in_leaf': 100}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)

Test

def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=0) > 0.5).astype(int)
    return accuracy_score(Y_test, ypred_mean)

base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)

print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))

base: 0.91586, fine tuning: 0.91764, fine tuning final: 0.91842

Derived Variables Based on Business Logic Analysis

train_YW = df_train.copy()
test_YW = df_test.copy()

train_YW['installment_feat'] = train_YW['continuous_installment'] / ((train_YW['continuous_annual_inc']+1) / 12)
test_YW['installment_feat'] = test_YW['continuous_installment'] / ((test_YW['continuous_annual_inc']+1) / 12)

Data Preprocessing

X_train = train_YW.drop(columns=['loan_status']).values
Y_train = train_YW['loan_status'].values.astype(int)
X_test = test_YW.drop(columns=['loan_status']).values
Y_test = test_YW['loan_status'].values.astype(int)


# Split data for five-fold
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

Algorithm

def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

Train

param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round': 1000}

param_fine_tuning = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
param_fine_tuningfinal = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 900,
                          'learning_rate': 7e-3, 'feature_fraction': 0.8, 'bagging_fraction': 0.6, 'max_depth': 20, 'min_sum_hessian_in_leaf': 100}
# base param train
param_base_model = get_model(param_base)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)

Test

def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=0) > 0.5).astype(int)
    return accuracy_score(Y_test, ypred_mean)

base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)

print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))

base: 0.9162, fine tuning: 0.91758, fine tuning final: 0.91844