Data Analysis Practical Project based on Lending Club [1]
This practical project is based on the dataset from Lending Club
Dataset URL: https://github.com/H-Freax/lendingclub_analyse
This project is conducted in a Colab environment
Introduction
This data analysis practical project is divided into two parts. The first part mainly introduces the Baseline method based on LightGBM and three methods of adding derived variables, identifying four sets of derived variables that can improve the model’s performance. The second part focuses on data analysis using machine learning and deep learning network methods, practicing the integration of machine learning methods and the fusion of deep learning networks with machine learning methods.
Environment Preparation
The project uses LightGBM as the baseline. First, import the necessary packages.
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
Loading Data
seed = 42 # fixed seed so the KFold split is identical across runs
# Five-fold cross-validation splitter; shuffled with the fixed seed.
kf = KFold(n_splits=5, random_state=seed,shuffle=True)
# NOTE(review): assumes the preprocessed Lending Club CSVs sit in the working directory.
df_train = pd.read_csv('train_final.csv')
df_test = pd.read_csv('test_final.csv')
Basic Data Inspection/Analysis
Inspect basic information of df_train.
# Summary statistics (count/mean/std/quantiles) for the numeric columns.
df_train.describe()
To skip the one-hot encoding part for initial data analysis, list all column names using the following attribute:
# List every column name (including the one-hot encoded ones).
df_train.columns.values
Exclude the one-hot encoded variables for visualization and observe patterns:
import matplotlib.pyplot as plt
# Placeholder: fill in the list of one-hot encoded column names to exclude.
onehotlabels=[...]
showdf_train=df_train.drop(columns=onehotlabels)
# Histogram of each remaining (non-one-hot) column.
showdf_train.hist(bins=50,figsize=(20,15))
plt.show()
As ‘continuous_fico_range’ and ‘continuous_last_fico_range’ have upper and lower bounds and are highly correlated, we remove the ‘high’ part for further visualization analysis.
from pandas.plotting import scatter_matrix
# Pairwise scatter plots; the '_high' FICO columns are dropped because they are
# bounded and highly correlated with their '_low' counterparts (see note above the cell).
scatter_matrix(showdf_train.drop(columns=['continuous_fico_range_high','continuous_last_fico_range_high']),figsize=(40,35))
Baseline
Data Preprocessing
# Baseline preprocessing: split features/label into numpy arrays.
X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

# Split data for five-fold cross-validation (indentation restored — the
# exported text had the loop body flattened to column 0, a syntax error).
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

X_train.shape, Y_train.shape
Algorithm
def get_model(param):
    """Train one LightGBM booster per CV fold and return the list of boosters.

    param: dict of LightGBM training parameters.
    Reads the module-level ``five_fold_data`` produced by the KFold split.
    """
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list
Train
# Baseline LightGBM parameters.
# NOTE(review): 'binary' is not a documented LightGBM metric name
# ('binary_logloss' / 'binary_error' are) — confirm the intended metric.
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}
# Fine-tuned parameters: more leaves, small learning rate, feature/bagging subsampling.
param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
Test
def test_model(model_list):
    """Average the five folds' predictions on X_test and return test accuracy."""
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    # axis=-2 is the fold axis of the (5, n_samples) array (same as axis=0 here);
    # majority vote via mean-then-threshold at 0.5.
    ypred_mean = (five_fold_pred.mean(axis=-2) > 0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)


base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
print('base: {}, fine tuning: {}'.format(base_score, fine_tuning_score))
Adding Derived Variables
CatBoostEncoder
Import the required environment.
# Install the encoder package (shell command — run once in Colab); the original
# bare `pip install ...` line is a Python syntax error, so it is commented out.
# !pip install category_encoders
import category_encoders as ce  # package providing CatBoostEncoder

# Create the encoder for the 'continuous_open_acc' column.
target_enc = ce.CatBoostEncoder(cols='continuous_open_acc')
# Fit on training data only, so no target information leaks from the test set.
target_enc.fit(df_train['continuous_open_acc'], df_train['loan_status'])
# Transform the feature, rename the new column with a '_cb' suffix, and join back.
train_CBE = df_train.join(target_enc.transform(df_train['continuous_open_acc']).add_suffix('_cb'))
test_CBE = df_test.join(target_enc.transform(df_test['continuous_open_acc']).add_suffix('_cb'))
Data Preprocessing
# Preprocessing for the CatBoost-encoded frames (reconstructed: the exported
# text fused the comment, list initialisation, and loop onto single lines).
X_train = train_CBE.drop(columns=['loan_status']).values
Y_train = train_CBE['loan_status'].values.astype(int)
X_test = test_CBE.drop(columns=['loan_status']).values
Y_test = test_CBE['loan_status'].values.astype(int)

# split data for five fold
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])
Algorithm
def get_model(param):
    """Train one LightGBM booster per CV fold and return the list of boosters.

    param: dict of LightGBM training parameters.
    Reads the module-level ``five_fold_data`` produced by the KFold split.
    """
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list
Train
# Baseline LightGBM parameters (see review note in the baseline section re 'metric').
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round': 1000}
# Fine-tuned parameters: more leaves, small learning rate, subsampling.
param_fine_tuning = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary',
                     'num_round': 1000, 'learning_rate': 3e-3,
                     'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# Final tuning: identical but trained for 1200 rounds. (The exported text had the
# two dict assignments fused onto one line — split back into separate statements.)
param_fine_tuningfinal = {'num_thread': 8, 'num_leaves': 128, 'metric': 'binary', 'objective': 'binary',
                          'num_round': 1200, 'learning_rate': 3e-3,
                          'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)
Test
def test_model(model_list):
    """Average fold predictions on X_test and return test accuracy."""
    data = X_test  # fix: original read `data is X_test`, which is a comparison, not an assignment
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    # Mean over the fold axis, thresholded at 0.5 for the binary label.
    ypred_mean = (five_fold_pred.mean(axis=-2) > 0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)


base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)
print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))
base: 0.91568, fine tuning: 0.91774, fine tuning final: 0.91796
Discretization
Based on Clustering for ‘continuous_open_acc’
# Inspect the unique values of 'continuous_open_acc' before clustering them.
df_train.groupby('continuous_open_acc')['continuous_open_acc'].unique()
# KMeans ships with scikit-learn — the original `!pip install KMeans` installed
# an unrelated/bogus package and is not needed.
from sklearn.cluster import KMeans

ddtrain = df_train['continuous_open_acc']
ddtest = df_test['continuous_open_acc']

# Cluster the training column values into 5 groups.
data_reshape1 = ddtrain.values.reshape((ddtrain.shape[0], 1))
model_kmeans = KMeans(n_clusters=5, random_state=0)
kmeans_result = model_kmeans.fit_predict(data_reshape1)
traina = kmeans_result

# NOTE(review): the test column is clustered with a *separately fitted* KMeans,
# so its cluster ids are not guaranteed to correspond to the training clusters —
# consider reusing the train-fitted model via model_kmeans.predict(data_reshape2).
data_reshape2 = ddtest.values.reshape((ddtest.shape[0], 1))
model_kmeans = KMeans(n_clusters=5, random_state=0)
kmeans_result = model_kmeans.fit_predict(data_reshape2)
testa = kmeans_result

# Attach the cluster id as a derived feature on copies of the frames.
train_KM = df_train.copy()
test_KM = df_test.copy()
train_KM['continuous_open_acc_km'] = traina
test_KM['continuous_open_acc_km'] = testa
Data Preprocessing
# Preprocessing for the KMeans-augmented frames.
X_train = train_KM.drop(columns=['loan_status']).values
Y_train = train_KM['loan_status'].values.astype(int)
X_test = test_KM.drop(columns=['loan_status']).values
Y_test = test_KM['loan_status'].values.astype(int)

# Split data for five-fold (indentation of the loop body restored).
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])
Algorithm
def get_model(param):
    """Train one LightGBM booster per CV fold and return the list of boosters.

    param: dict of LightGBM training parameters.
    Reads the module-level ``five_fold_data`` produced by the KFold split.
    """
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list
Train
# Baseline LightGBM parameters.
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}
# Fine-tuned parameters: more leaves, small learning rate, subsampling.
param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# Final tuning: GOSS boosting with depth/hessian constraints.
# NOTE(review): LightGBM documents that 'goss' does not support bagging, so
# 'bagging_fraction' is likely ignored here; also the documented key is
# 'tree_learner', not 'tree_learning' — confirm both against the LightGBM docs.
param_fine_tuningfinal={'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 800,
'learning_rate': 6e-3, 'feature_fraction': 0.8, 'bagging_fraction': 0.6,'boosting':'goss','tree_learning':'feature','max_depth':20,'min_sum_hessian_in_leaf':100}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)
Test
def test_model(model_list):
    """Average fold predictions on X_test and return test accuracy."""
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    # Mean over the fold axis, thresholded at 0.5 for the binary label.
    ypred_mean = (five_fold_pred.mean(axis=-2) > 0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)


base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)
print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))
base: 0.91598, fine tuning: 0.91776, fine tuning final: 0.91874
Using Exponential Interval Division for ‘continuous_loan_amnt’
train_ZQ = df_train.copy()
test_ZQ = df_test.copy()
# Exponential-interval binning: floor(log10(x)) groups amounts by order of magnitude.
trainbins = np.floor(np.log10(train_ZQ['continuous_loan_amnt']))
# Fix: the original line was broken mid-identifier ('continuous\n_loan_amnt'),
# which is a syntax error inside a string literal.
testbins = np.floor(np.log10(test_ZQ['continuous_loan_amnt']))
train_ZQ['continuous_loan_amnt_km'] = trainbins
test_ZQ['continuous_loan_amnt_km'] = testbins
Data Preprocessing
# Preprocessing for the exponential-binned frames.
X_train = train_ZQ.drop(columns=['loan_status']).values
Y_train = train_ZQ['loan_status'].values.astype(int)
X_test = test_ZQ.drop(columns=['loan_status']).values
Y_test = test_ZQ['loan_status'].values.astype(int)

# Split data for five-fold (indentation of the loop body restored).
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])
Algorithm
def get_model(param):
    """Train one LightGBM booster per CV fold and return the list of boosters.

    param: dict of LightGBM training parameters.
    Reads the module-level ``five_fold_data`` produced by the KFold split.
    """
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        # Fix: original read `bst is lgb.train(...)` — a comparison, not an assignment.
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list
Train
# Baseline LightGBM parameters.
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}
# Fine-tuned parameters: more leaves, small learning rate, subsampling.
param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# Final tuning: higher learning rate, depth cap and leaf-hessian constraint.
param_fine_tuningfinal={'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 900,
'learning_rate': 7e-3, 'feature_fraction': 0.8, 'bagging_fraction': 0.6,'max_depth':20,'min_sum_hessian_in_leaf':100}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)
Test
def test_model(model_list):
    """Average fold predictions on X_test and return test accuracy."""
    data = X_test  # fix: original read `data is X_test`, which is a comparison, not an assignment
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    # Mean over the fold axis, thresholded at 0.5 for the binary label.
    ypred_mean = (five_fold_pred.mean(axis=-2) > 0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)


base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)
print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))
base: 0.91586, fine tuning: 0.91764, fine tuning final: 0.91842
Derived Variables Based on Business Logic Analysis
train_YW = df_train.copy()
test_YW = df_test.copy()
# Monthly payment burden: installment divided by (approximate) monthly income.
# The +1 guards against division by zero for records with zero annual income.
train_YW['installment_feat'] = train_YW['continuous_installment'] / ((train_YW['continuous_annual_inc']+1) / 12)
test_YW['installment_feat'] = test_YW['continuous_installment'] / ((test_YW['continuous_annual_inc']+1) / 12)
Data Preprocessing
# Preprocessing for the business-logic-feature frames.
X_train = train_YW.drop(columns=['loan_status']).values
Y_train = train_YW['loan_status'].values.astype(int)
X_test = test_YW.drop(columns=['loan_status']).values
Y_test = test_YW['loan_status'].values.astype(int)

# Split data for five-fold (indentation of the loop body restored).
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])
Algorithm
def get_model(param):
    """Train one LightGBM booster per CV fold and return the list of boosters.

    param: dict of LightGBM training parameters.
    Reads the module-level ``five_fold_data`` produced by the KFold split.
    """
    model_list = []
    # Fix: original read `enumeratefive_fold_data)` — the opening paren was lost.
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list
Train
# Baseline LightGBM parameters.
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}
# Fine-tuned parameters: more leaves, small learning rate, subsampling.
param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000,
'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
# Final tuning: higher learning rate, depth cap and leaf-hessian constraint.
param_fine_tuningfinal={'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 900,
'learning_rate': 7e-3, 'feature_fraction': 0.8, 'bagging_fraction': 0.6,'max_depth':20,'min_sum_hessian_in_leaf':100}
# base param train
param_base_model = get_model(param_base)
# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)
param_fine_tuningfinal_model = get_model(param_fine_tuningfinal)
Test
def test_model(model_list):
    """Average fold predictions on X_test and return test accuracy."""
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    # Mean over the fold axis, thresholded at 0.5 for the binary label.
    ypred_mean = (five_fold_pred.mean(axis=-2) > 0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)


base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)
fine_tuningfinal_score = test_model(param_fine_tuningfinal_model)
print('base: {}, fine tuning: {}, fine tuning final: {}'.format(base_score, fine_tuning_score, fine_tuningfinal_score))
base: 0.9162, fine tuning: 0.91758, fine tuning final: 0.91844