Case Study — House Price Prediction using Advanced Regresssion

apurv jain
6 min readSep 25, 2020

--

For more projects like this, please subscribe my blog ‘TopBlog’@http://naivedatascientist.co.in

Importing Important Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

Reading the data.

housing=pd.read_csv("D:\\PYTHON_MAIN_COURSE(IIIT-Bnglr)\\COURSE-4-ML_PRED-ANALYSIS-2\\ASSIGNMENT\\train.csv")housing.shape
(1460, 81)
housing.head()

Analysing missing values

housing['GarageYrBlt']. replace (np.NaN, housing['GarageYrBlt'].median(), inplace=True)housing['MasVnrArea']. replace (np.NaN, housing['MasVnrArea'].median(), inplace=True)null_df=["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish",  "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]for i in null_df:
housing[i].fillna("none",inplace=True)
housing['SalePrice'].describe()count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000

UNIVARIATE ANALYSIS

plt.boxplot(housing['SalePrice'])
plt.show()
plt.figure(figsize=(8,8))
sns.distplot(housing['SalePrice'],rug=True)
plt.xlabel("Sale price",fontsize=25)
plt.show()
print("Skewness: %f" % housing['SalePrice'].skew())
print("Kurtosis: %f" % housing['SalePrice'].kurt())

Skewness: 1.882876
Kurtosis: 6.536282
plt.figure(figsize=(5,5))
sns.distplot(housing['LotArea'])
plt.xlabel("Lot Area",fontsize=25)
plt.show()
print("maximum homes have",round(housing['LotArea'].mean(),2)," sq ft size")
plt.figure(figsize=(5,5))
sns.distplot(housing['OverallQual'],rug=True)
plt.xlabel("overall Quality",fontsize=25)
plt.show()

plt.figure(figsize=(5,5)) sns.distplot(housing['OverallCond'],rug=True) plt.xlabel("Overall Condition of homes",fontsize=15) plt.show()

plt.figure(figsize=(8,8))
fig1=sns.barplot(x='MSZoning',y='SalePrice',estimator=np.median,data= housing)
fig1.set_xticklabels(['low density','medium density','all','floating villlage','high density' ],rotation=90,fontsize=12)
for i in fig1.patches:
# get_x pulls left or right; get_height pushes up or down
fig1.text(i.get_x()+.00, i.get_height()+120, \
str(round((i.get_height()), 0)), fontsize=21, color='black',
rotation=0)
plt.show()
plt.figure(figsize=(15,10))
fig2=sns.barplot(x='MSSubClass',y='SalePrice',estimator=np.median,data= housing)
for i in fig2.patches:
# get_x pulls left or right; get_height pushes up or down
fig2.text(i.get_x()-.05, i.get_height()+0, \
str(round((i.get_height()), 0)), fontsize=13, color='black',
rotation=0)
plt.show()
plt.figure(figsize=(8,8))
fig5=sns.barplot(x='BsmtQual',y='SalePrice',estimator=np.median,data= housing)
plt.xticks(fontsize=12)
plt.xlabel("Basement Height",fontsize=15)
plt.ylabel("Sale Price",fontsize=15)
plt.title("Basement Hight vs Sale Price",fontsize=20)
for i in fig5.patches:
# get_x pulls left or right; get_height pushes up or down
fig5.text(i.get_x()-.05, i.get_height()+0, \
str(round((i.get_height()), 0)), fontsize=15, color='black',
rotation=0)
plt.show()
fig6=sns.barplot(x='Heating',y='SalePrice',estimator=np.median,data= housing)
fig6.set_xticklabels(['Gas forced','hot water','Gravity furnace','wall Furnace','water/heat','floor furnace' ],rotation=90,fontsize=12)

plt.xticks(fontsize=12)
plt.xlabel("Heating Type",fontsize=15)
plt.ylabel("Sale Price",fontsize=15)
plt.title("Heating Type vs Sale Price",fontsize=20)
for i in fig6.patches:
# get_x pulls left or right; get_height pushes up or down
fig6.text(i.get_x()-.05, i.get_height()+0, \
str(round((i.get_height()), 0)), fontsize=15, color='black',
rotation=0)
plt.show()

Examinig numerical features.

numeric_cols=housing.select_dtypes(include=[np.number])
numeric_cols.head()

Finding Missing values.

In [36]:

total=numeric_cols.isnull().sum().sort_values(ascending=False)
percent=round(((numeric_cols.isnull().sum()/housing.shape[0])*100),2).sort_values(ascending=False)
missing=pd.concat([total,percent], axis=1, join='outer', keys=['Total Missing Values','% of Missing data'])
missing.index.name='features'
missing.head(10)

numeric_cols.apply(pd.Series.nunique)

MSSubClass         15
LotArea 1073
OverallQual 10
OverallCond 9
YearBuilt 112
YearRemodAdd 61
MasVnrArea 327
BsmtFinSF1 637
BsmtFinSF2 144
BsmtUnfSF 780
TotalBsmtSF 721
1stFlrSF 753
2ndFlrSF 417
LowQualFinSF 24
GrLivArea 861
BsmtFullBath 4
BsmtHalfBath 3
FullBath 4
HalfBath 3
BedroomAbvGr 8
KitchenAbvGr 4
TotRmsAbvGrd 12
Fireplaces 4
GarageYrBlt 97
GarageCars 5
GarageArea 441
WoodDeckSF 274
OpenPorchSF 202
EnclosedPorch 120
3SsnPorch 20
ScreenPorch 76
PoolArea 8
MiscVal 21
MoSold 12
YrSold 5
SalePrice 663
dtype: int64

Examining categorical features.

categ_cols=housing.select_dtypes(include=[np.object])
categ_cols.head()

Finding missing values.

total=categ_cols.isnull().sum().sort_values(ascending=False)
percent=round(((categ_cols.isnull().sum()/housing.shape[0])*100),2).sort_values(ascending=False)
missing_cat=pd.concat([total,percent], axis=1, join='outer', keys=['Total Missing Values','% of Missing data'])
missing_cat.index.name='features'
missing_cat.head(10)

Analysing correlations with target variable.

correl=numeric_cols.corr()
print(correl['SalePrice'].sort_values(ascending=False,),'\n')
SalePrice 1.000000
OverallQual 0.790982
GrLivArea 0.708624
GarageCars 0.640409
GarageArea 0.623431
TotalBsmtSF 0.613581
1stFlrSF 0.605852
FullBath 0.560664
TotRmsAbvGrd 0.533723
YearBuilt 0.522897
YearRemodAdd 0.507101
MasVnrArea 0.472614
Fireplaces 0.466929
GarageYrBlt 0.466754
BsmtFinSF1 0.386420
WoodDeckSF 0.324413
2ndFlrSF 0.319334
OpenPorchSF 0.315856
HalfBath 0.284108
LotArea 0.263843
BsmtFullBath 0.227122
BsmtUnfSF 0.214479
BedroomAbvGr 0.168213
ScreenPorch 0.111447
PoolArea 0.092404
MoSold 0.046432
3SsnPorch 0.044584
BsmtFinSF2 -0.011378
BsmtHalfBath -0.016844
MiscVal -0.021190
LowQualFinSF -0.025606
YrSold -0.028923
OverallCond -0.077856
MSSubClass -0.084284

Plotting heatmap for correlations.

plt.figure(figsize=(24,24))
corrmat=housing.corr()
sns.heatmap(corrmat,cmap='YlGnBu',annot=True)
plt.show()

Pairplots and scatter plots with highly correlated coulmns.

sns.set()
colz = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
for col in colz:
sns.lmplot(col,'SalePrice',data=housing)
plt.show()
sns.set()
columns=['SalePrice','OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(housing[columns])
plt.show()

probablity and normal probablity plot for Sale Price.

sns.distplot(housing['SalePrice'],fit=norm)
fig=plt.figure()
res=stats.probplot(housing['SalePrice'], plot=plt)

Tranformed histogram and Probablity plot for sale price.

sns.distplot(housing['SalePrice'],fit=norm)
fig=plt.figure()
res=stats.probplot(housing['SalePrice'], plot=plt)

Data Preparation.

Y=housing.pop('SalePrice')
X=housing
## creating dummy variables for categorical variables.
housing_cat=X.select_dtypes(include=['object'])
housing_cat.head()

Scaling the features.

from sklearn.preprocessing import scale
kols=X.columns
X=pd.DataFrame(scale(X))
X.columns=kols
X.columns
## Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

Model Building and Evaluations.

Ridge and Lasso Regression.

## list of alphas
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1,
0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}
ridge=Ridge()
## Cross validation
folds=5
model_cv= GridSearchCV(estimator=ridge, param_grid=params ,scoring='neg_mean_absolute_error',
cv=folds,return_train_score=True,verbose=1)
model_cv.fit(x_train,y_train)
cv_results=pd.DataFrame(model_cv.cv_results_)
cv_results=cv_results[cv_results['param_alpha'] <=200]
cv_results.head()
cv_results['param_alpha'] = cv_results['param_alpha'].astype('int32')

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()
cv_results['param_alpha'] = cv_results['param_alpha'].astype('int32')

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()
model_cv.best_score_
-0.08733957975319036
model_cv.best_params_
'alpha': 100}
## Optimal value for alpha is 100 for ridge
alpha=100
ridge=Ridge(alpha)
ridge.fit(x_train,y_train)
ridge.coef_
y_train_pred=ridge.predict(x_train)
print(r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred=ridge.predict(x_test)
print(r2_score(y_true=y_test, y_pred=y_test_pred))
0.9430880959227408
0.9182827529139239

r2 score is good for both train and test data so clearly no overfitting.

LASSO regression.

params = {'alpha': [0.0001,0.0002,0.0003,0.0004,0.0005,0.001, ]}


lasso= Lasso()
model_cv= GridSearchCV(estimator=lasso, param_grid=params , scoring='neg_mean_absolute_error',
cv=folds, return_train_score=True, verbose=1)
model_cv.fit(x_train,y_train)
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

# plotting mean test and train scoes with alpha
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')

# plotting
plt.figure(figsize=(8,8))
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')

plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

model_cv.best_score_
-0.08373128974849563
model_cv.best_params_
{'alpha': 0.001}
alpha=0.001

lasso=Lasso(alpha=alpha)
lasso.fit(x_train,y_train)
# prediction
y_train_pred=ridge.predict(x_train)
print(r2_score(y_true=y_train, y_pred=y_train_pred))
y_test_pred=ridge.predict(x_test)
print(r2_score(y_true=y_test, y_pred=y_test_pred))

0.9430880959227408
0.9182827529139239
main_coef=pd.Series(lasso.coef_,index=x_train.columns)
var_selected= sum(lasso.coef_!=0)
print("the num of feautes selected by lasso : ",var_selected)
the num of feautes selected by lasso : 176main_features=pd.concat([main_coef.sort_values(ascending=False).head(10),main_coef.sort_values(ascending=False).tail(10)])
main_features
MSZoning_RL 0.116680
GrLivArea 0.081700
MSZoning_RM 0.074674
OverallQual 0.063872
YearBuilt 0.063544
MSZoning_FV 0.061964
OverallCond 0.050045
LotArea 0.035551
1stFlrSF 0.034417
2ndFlrSF 0.031540
Functional_Sev -0.008296
BldgType_Twnhs -0.008711
SaleType_WD -0.009701
Condition1_PosA -0.011586
Functional_Maj2 -0.012064
LandSlope_Sev -0.013436
KitchenAbvGr -0.014721
Heating_Grav -0.016029
KitchenQual_TA -0.032246
KitchenQual_Gd -0.032364

--

--

apurv jain

Graduated as a Marine Engineer from one of India’s most prestigious Engineering College (under IIT-JEE), having inclination toward Quantitative and Statistical