Grid Search CV 💪 vs. Randomized Search CV 🦾

Ravi Kumar
4 min read · Apr 17, 2022

--

For full code: https://www.kaggle.com/code/imnoob/gridsearchcv-vs-randomizesearchcv

Approach: Both searches use 3-fold cross-validation.

  • Grid search exhaustively evaluates every combination of the candidate values supplied for each hyperparameter, and uses cross-validation to score each combination.
  • Random search samples a fixed number of combinations (`n_iter`, 10 by default) from the supplied values or distributions, and uses cross-validation to score each sampled combination.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import warnings
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings(action="ignore")

# Raise the pandas display limits (rows, columns, total width, column width)
# and render floats with four decimal places.
for option_name, option_value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 2000,
    'display.max_colwidth': None,
    'display.float_format': '{:.4f}'.format,
}.items():
    pd.set_option(option_name, option_value)

Reading the Dataset

Dataset used: https://www.kaggle.com/datasets/dinhanhx/studentgradepassorfailprediction

# Load the student pass/fail dataset from the Kaggle input directory.
df_student = pd.read_csv(
    filepath_or_buffer="../input/studentgradepassorfailprediction/student-mat-pass-or-fail.csv",
)
# Show the first rows of the frame in the notebook output.
display(df_student.head())
DataFrame

Train test split

# Separate the target column from the predictors.
y = df_student['pass']  # Labels
X = df_student.drop(columns='pass')  # Features

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

Hyper Parameter tuning for Random Forest

# Base estimator handed to both hyperparameter searches below.
rf_model = RandomForestClassifier()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=80, num=10)]  # 10, 17, 25, ..., 80
# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and, for classifiers, was only an
# alias of 'sqrt', so listing both fitted every candidate twice. Search the
# two distinct strategies instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2, 4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Candidate values keyed by RandomForestClassifier parameter name; the same
# dictionary is reused by the grid search and the randomized search.
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

Grid search CV

# Search every combination in param_grid, scoring each with 3-fold
# cross-validation across four parallel jobs.
rf_Grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=4,
    verbose=2,
)
rf_Grid.fit(X_train, y_train)
# Last expression of the cell, so the notebook displays the winning combination.
rf_Grid.best_params_
Best parameters

Randomized search CV

# Sample candidate combinations from the same param_grid and score each with
# 3-fold cross-validation across four parallel jobs.
rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=10,  # scikit-learn's default, spelled out: only 10 combinations are tried
    cv=3,
    n_jobs=4,
    verbose=2,
)
rf_random.fit(X_train, y_train)
# Last expression of the cell, so the notebook displays the winning combination.
rf_random.best_params_
Best parameters

Comparison between both the searches

# Show the winning hyperparameters of both searches, separated by a blank line.
print(
    f"For Grid search : {rf_Grid.best_params_}",
    '\n\n',
    f"For Randomized search : {rf_random.best_params_}",
)

Accuracy with both sets of hyperparameters

Grid Search

# Accuracy of the refit grid-search winner on the training and held-out data.
for split_name, features, labels in (
    ('Train', X_train, y_train),
    ('Test', X_test, y_test),
):
    print(f'{split_name} Accuracy - : {rf_Grid.score(features, labels):.3f}')

Randomized Search

# Accuracy of the refit randomized-search winner on the training and held-out data.
for split_name, features, labels in (
    ('Train', X_train, y_train),
    ('Test', X_test, y_test),
):
    print(f'{split_name} Accuracy - : {rf_random.score(features, labels):.3f}')

Train the random forest with the best hyperparameters

Grid Search CV

# Train a fresh forest with the combination the grid search selected.
# Reading rf_Grid.best_params_ keeps this cell in sync with the search above,
# instead of hard-coding values pasted from an earlier run (which included
# max_features='auto', a value scikit-learn 1.3+ rejects).
rf_model_grid = RandomForestClassifier(**rf_Grid.best_params_)
rf_model_grid.fit(X_train, y_train)

# Evaluate on the held-out split and print per-class precision/recall/F1.
y_pred = rf_model_grid.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

Confusion Matrix

# Raw (un-normalized) confusion matrix for the current y_pred:
# rows are actual classes, columns are predicted classes.
cf = confusion_matrix(y_test, y_pred)

plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix without Normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Take the labels from the classes actually present in y_test / y_pred (the
# same sorted union confusion_matrix uses) so the tick count always matches
# the matrix shape, rather than pairing a computed count with fixed labels.
class_labels = [str(label) for label in np.unique(np.concatenate([y_test, y_pred]))]
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks, class_labels)
plt.yticks(tick_marks, class_labels)

# Write each count inside its cell; switch to white text on dark cells.
thresh = cf.max() / 2.
for row, col in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
    plt.text(
        col,
        row,
        format(cf[row, col], 'd'),
        horizontalalignment='center',
        color='white' if cf[row, col] > thresh else 'black',
    )
plt.show()

Randomized search CV

# Train a fresh forest with the combination the randomized search selected.
# Reading rf_random.best_params_ keeps this cell in sync with the search above,
# instead of hard-coding values pasted from an earlier run (which included
# max_features='auto', a value scikit-learn 1.3+ rejects).
rf_model_randomize = RandomForestClassifier(**rf_random.best_params_)
rf_model_randomize.fit(X_train, y_train)

# Evaluate on the held-out split and print per-class precision/recall/F1.
y_pred = rf_model_randomize.predict(X_test)
report = classification_report(y_test, y_pred)
print("The Classification report: \n {}".format(report))

Confusion Matrix

# Raw (un-normalized) confusion matrix for the current y_pred:
# rows are actual classes, columns are predicted classes.
cf = confusion_matrix(y_test, y_pred)

plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix without Normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Take the labels from the classes actually present in y_test / y_pred (the
# same sorted union confusion_matrix uses) so the tick count always matches
# the matrix shape, rather than pairing a computed count with fixed labels.
class_labels = [str(label) for label in np.unique(np.concatenate([y_test, y_pred]))]
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks, class_labels)
plt.yticks(tick_marks, class_labels)

# Write each count inside its cell; switch to white text on dark cells.
thresh = cf.max() / 2.
for row, col in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
    plt.text(
        col,
        row,
        format(cf[row, col], 'd'),
        horizontalalignment='center',
        color='white' if cf[row, col] > thresh else 'black',
    )
plt.show()

Conclusion

This is a very small dataset, so the results of the two searches are quite similar, but you can apply the same technique to other datasets and see where they differ.

If this article helped you, don’t forget to like it and share it with your friends 👍 Happy Learning!!

--

--

Ravi Kumar

Hey Techies! I am here to share what I learn about different technologies, tools, business, and automation. For more details: https://linktr.ee/ravikumar10593