Case Study — House Price Prediction using Advanced Regresssion

6 min readSep 25, 2020

For more projects like this, please subscribe my blog ‘TopBlog’@http://naivedatascientist.co.in

Importing Important Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

Reading the data.

housing=pd.read_csv("D:\\PYTHON_MAIN_COURSE(IIIT-Bnglr)\\COURSE-4-ML_PRED-ANALYSIS-2\\ASSIGNMENT\\train.csv")housing.shape
(1460, 81)housing.head()

Analysing missing values

housing['GarageYrBlt']. replace (np.NaN, housing['GarageYrBlt'].median(), inplace=True)housing['MasVnrArea']. replace (np.NaN, housing['MasVnrArea'].median(), inplace=True)null_df=["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish",  "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]for i in null_df:
    housing[i].fillna("none",inplace=True)housing['SalePrice'].describe()count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000

UNIVARIATE ANALYSIS

plt.boxplot(housing['SalePrice'])
plt.show()

plt.figure(figsize=(8,8))
sns.distplot(housing['SalePrice'],rug=True)
plt.xlabel("Sale price",fontsize=25)
plt.show()

print("Skewness: %f" % housing['SalePrice'].skew())
print("Kurtosis: %f" % housing['SalePrice'].kurt())

Skewness: 1.882876
Kurtosis: 6.536282plt.figure(figsize=(5,5))
sns.distplot(housing['LotArea'])
plt.xlabel("Lot Area",fontsize=25)
plt.show()
print("maximum homes have",round(housing['LotArea'].mean(),2)," sq ft size")

plt.figure(figsize=(5,5))
sns.distplot(housing['OverallQual'],rug=True)
plt.xlabel("overall Quality",fontsize=25)
plt.show()

plt.figure(figsize=(5,5)) sns.distplot(housing['OverallCond'],rug=True) plt.xlabel("Overall Condition of homes",fontsize=15) plt.show()

plt.figure(figsize=(8,8))
fig1=sns.barplot(x='MSZoning',y='SalePrice',estimator=np.median,data= housing)
fig1.set_xticklabels(['low density','medium density','all','floating villlage','high density' ],rotation=90,fontsize=12)
for i in fig1.patches:
    # get_x pulls left or right; get_height pushes up or down
    fig1.text(i.get_x()+.00, i.get_height()+120, \
            str(round((i.get_height()), 0)), fontsize=21, color='black',
                rotation=0)
plt.show()

plt.figure(figsize=(15,10))
fig2=sns.barplot(x='MSSubClass',y='SalePrice',estimator=np.median,data= housing)
for i in fig2.patches:
    # get_x pulls left or right; get_height pushes up or down
    fig2.text(i.get_x()-.05, i.get_height()+0, \
            str(round((i.get_height()), 0)), fontsize=13, color='black',
                rotation=0)
plt.show()

plt.figure(figsize=(8,8))
fig5=sns.barplot(x='BsmtQual',y='SalePrice',estimator=np.median,data= housing)
plt.xticks(fontsize=12)
plt.xlabel("Basement Height",fontsize=15)
plt.ylabel("Sale Price",fontsize=15)
plt.title("Basement Hight vs Sale Price",fontsize=20)
for i in fig5.patches:
    # get_x pulls left or right; get_height pushes up or down
    fig5.text(i.get_x()-.05, i.get_height()+0, \
            str(round((i.get_height()), 0)), fontsize=15, color='black',
                rotation=0)
plt.show()

fig6=sns.barplot(x='Heating',y='SalePrice',estimator=np.median,data= housing)
fig6.set_xticklabels(['Gas forced','hot water','Gravity furnace','wall Furnace','water/heat','floor furnace' ],rotation=90,fontsize=12)

plt.xticks(fontsize=12)
plt.xlabel("Heating Type",fontsize=15)
plt.ylabel("Sale Price",fontsize=15)
plt.title("Heating Type vs Sale Price",fontsize=20)
for i in fig6.patches:
    # get_x pulls left or right; get_height pushes up or down
    fig6.text(i.get_x()-.05, i.get_height()+0, \
            str(round((i.get_height()), 0)), fontsize=15, color='black',
                rotation=0)
plt.show()

Examinig numerical features.

numeric_cols=housing.select_dtypes(include=[np.number])
numeric_cols.head()

Finding Missing values.

In [36]:

total=numeric_cols.isnull().sum().sort_values(ascending=False)
percent=round(((numeric_cols.isnull().sum()/housing.shape[0])*100),2).sort_values(ascending=False)
missing=pd.concat([total,percent], axis=1, join='outer', keys=['Total Missing Values','% of Missing data'])
missing.index.name='features'
missing.head(10)

numeric_cols.apply(pd.Series.nunique)

MSSubClass         15
LotArea          1073
OverallQual        10
OverallCond         9
YearBuilt         112
YearRemodAdd       61
MasVnrArea        327
BsmtFinSF1        637
BsmtFinSF2        144
BsmtUnfSF         780
TotalBsmtSF       721
1stFlrSF          753
2ndFlrSF          417
LowQualFinSF       24
GrLivArea         861
BsmtFullBath        4
BsmtHalfBath        3
FullBath            4
HalfBath            3
BedroomAbvGr        8
KitchenAbvGr        4
TotRmsAbvGrd       12
Fireplaces          4
GarageYrBlt        97
GarageCars          5
GarageArea        441
WoodDeckSF        274
OpenPorchSF       202
EnclosedPorch     120
3SsnPorch          20
ScreenPorch        76
PoolArea            8
MiscVal            21
MoSold             12
YrSold              5
SalePrice         663
dtype: int64

Examining categorical features.

categ_cols=housing.select_dtypes(include=[np.object])
categ_cols.head()

Finding missing values.

total=categ_cols.isnull().sum().sort_values(ascending=False)
percent=round(((categ_cols.isnull().sum()/housing.shape[0])*100),2).sort_values(ascending=False)
missing_cat=pd.concat([total,percent], axis=1, join='outer', keys=['Total Missing Values','% of Missing data'])
missing_cat.index.name='features'
missing_cat.head(10)

Analysing correlations with target variable.

correl=numeric_cols.corr()
print(correl['SalePrice'].sort_values(ascending=False,),'\n')SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.472614
Fireplaces       0.466929
GarageYrBlt      0.466754
BsmtFinSF1       0.386420
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284

Plotting heatmap for correlations.

plt.figure(figsize=(24,24))
corrmat=housing.corr()
sns.heatmap(corrmat,cmap='YlGnBu',annot=True)
plt.show()

Pairplots and scatter plots with highly correlated coulmns.

sns.set()
colz = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
for col in colz:
    sns.lmplot(col,'SalePrice',data=housing)
plt.show()

sns.set()
columns=['SalePrice','OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(housing[columns])
plt.show()

probablity and normal probablity plot for Sale Price.

sns.distplot(housing['SalePrice'],fit=norm)
fig=plt.figure()
res=stats.probplot(housing['SalePrice'], plot=plt)

Tranformed histogram and Probablity plot for sale price.

sns.distplot(housing['SalePrice'],fit=norm)
fig=plt.figure()
res=stats.probplot(housing['SalePrice'], plot=plt)

Data Preparation.

Y=housing.pop('SalePrice')
X=housing## creating dummy variables for categorical variables.
housing_cat=X.select_dtypes(include=['object'])
housing_cat.head()

Scaling the features.

from sklearn.preprocessing import scale
kols=X.columns
X=pd.DataFrame(scale(X))
X.columns=kols
X.columns## Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

Model Building and Evaluations.

Ridge and Lasso Regression.

## list of alphas
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}ridge=Ridge()
## Cross validation
folds=5
model_cv= GridSearchCV(estimator=ridge, param_grid=params ,scoring='neg_mean_absolute_error',
                       cv=folds,return_train_score=True,verbose=1)
model_cv.fit(x_train,y_train)cv_results=pd.DataFrame(model_cv.cv_results_)
cv_results=cv_results[cv_results['param_alpha'] <=200]
cv_results.head()cv_results['param_alpha'] = cv_results['param_alpha'].astype('int32')

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()cv_results['param_alpha'] = cv_results['param_alpha'].astype('int32')

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

model_cv.best_score_
-0.08733957975319036model_cv.best_params_
'alpha': 100}## Optimal value for alpha is 100 for ridge
alpha=100
ridge=Ridge(alpha)
ridge.fit(x_train,y_train)
ridge.coef_y_train_pred=ridge.predict(x_train)
print(r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred=ridge.predict(x_test)
print(r2_score(y_true=y_test, y_pred=y_test_pred))
0.9430880959227408
0.9182827529139239

r2 score is good for both train and test data so clearly no overfitting.

LASSO regression.

params = {'alpha': [0.0001,0.0002,0.0003,0.0004,0.0005,0.001, ]}


lasso= Lasso()
model_cv= GridSearchCV(estimator=lasso, param_grid=params , scoring='neg_mean_absolute_error',
                       cv=folds, return_train_score=True, verbose=1)
model_cv.fit(x_train,y_train)cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

# plotting mean test and train scoes with alpha
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')

# plotting
plt.figure(figsize=(8,8))
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')

plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

model_cv.best_score_
-0.08373128974849563model_cv.best_params_
{'alpha': 0.001}alpha=0.001

lasso=Lasso(alpha=alpha)
lasso.fit(x_train,y_train)# prediction
y_train_pred=ridge.predict(x_train)
print(r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred=ridge.predict(x_test)
print(r2_score(y_true=y_test, y_pred=y_test_pred))

0.9430880959227408
0.9182827529139239main_coef=pd.Series(lasso.coef_,index=x_train.columns)
var_selected= sum(lasso.coef_!=0)
print("the num of feautes selected by lasso : ",var_selected)the num of feautes selected by lasso :  176main_features=pd.concat([main_coef.sort_values(ascending=False).head(10),main_coef.sort_values(ascending=False).tail(10)])
main_featuresMSZoning_RL        0.116680
GrLivArea          0.081700
MSZoning_RM        0.074674
OverallQual        0.063872
YearBuilt          0.063544
MSZoning_FV        0.061964
OverallCond        0.050045
LotArea            0.035551
1stFlrSF           0.034417
2ndFlrSF           0.031540
Functional_Sev    -0.008296
BldgType_Twnhs    -0.008711
SaleType_WD       -0.009701
Condition1_PosA   -0.011586
Functional_Maj2   -0.012064
LandSlope_Sev     -0.013436
KitchenAbvGr      -0.014721
Heating_Grav      -0.016029
KitchenQual_TA    -0.032246
KitchenQual_Gd    -0.032364