Grid Search CV 💪 vs. Randomized Search CV 🦾
4 min read · Apr 17, 2022
For full code: https://www.kaggle.com/code/imnoob/gridsearchcv-vs-randomizesearchcv
Approach: Both searches use 3-fold cross-validation (cv = 3).
- Grid search takes a fixed list of candidate values for each hyperparameter being tested (here, for example, evenly spaced tree counts) and then uses cross-validation to test the accuracy of every possible combination.
- Randomized search draws a fixed number of random combinations of hyperparameter values (10 by default in scikit-learn) from the supplied candidates and then uses cross-validation to test the accuracy of only those combinations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# Raise pandas' display limits so DataFrame previews are not truncated,
# and show floats with four decimal places.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format = '{:.4f}'.format
Reading the Dataset
Dataset used: https://www.kaggle.com/datasets/dinhanhx/studentgradepassorfailprediction
# Load the student pass/fail dataset from the Kaggle input directory.
csv_path = "../input/studentgradepassorfailprediction/student-mat-pass-or-fail.csv"
df_student = pd.read_csv(csv_path)

# Preview the first rows in the notebook output.
preview = df_student.head()
display(preview)
Train test split
# Separate the target column from the predictors.
y = df_student['pass']  # Labels
X = df_student.drop(columns='pass')  # Features

# Split dataset into training set and test set: 30% of the rows are held
# out for testing, and the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
Hyperparameter tuning for Random Forest
# Base random forest (library defaults) passed as the estimator to the
# hyperparameter searches.
rf_model = RandomForestClassifier()
# Number of trees in the random forest: 10 evenly spaced values from 10 to
# 80, truncated to integers.
n_estimators = [int(x) for x in np.linspace(start=10, stop=80, num=10)]
# Number of features to consider at every split. 'auto' used to be an alias
# of 'sqrt' for classifiers (so listing both searched the same setting
# twice) and was removed in scikit-learn 1.3; search two distinct
# strategies instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2, 4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Candidate values per hyperparameter, keyed by the RandomForestClassifier
# argument name.
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
Grid search CV
# Exhaustive search: every combination in param_grid is scored with
# 3-fold cross-validation, using 4 parallel jobs.
rf_Grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=4,
)
rf_Grid.fit(X_train, y_train)

# Best combination found (shown as the notebook cell output).
rf_Grid.best_params_
Randomized search CV
# Randomized search: score only n_iter randomly drawn combinations from
# param_grid with 3-fold cross-validation, using 4 parallel jobs.
# n_iter=10 is the library default, spelled out so the search budget is
# visible; random_state fixes which combinations are drawn so reruns
# compare the same candidates (same seed as the train/test split).
rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=4,
    random_state=101,
)
rf_random.fit(X_train, y_train)

# Best sampled combination (shown as the notebook cell output).
rf_random.best_params_
Comparison between both the searches
# Show the winning hyperparameters of the two searches one after the other.
grid_summary = "For Grid search : {}".format(rf_Grid.best_params_)
random_summary = "For Randomized search : {}".format(rf_random.best_params_)
print(grid_summary, '\n\n', random_summary)
Accuracy with both sets of hyperparameters
Grid Search
# Accuracy of the grid search's fitted estimator on the training split,
# then on the held-out test split.
grid_train_accuracy = rf_Grid.score(X_train, y_train)
print(f'Train Accuracy - : {grid_train_accuracy:.3f}')
grid_test_accuracy = rf_Grid.score(X_test, y_test)
print(f'Test Accuracy - : {grid_test_accuracy:.3f}')
Randomized Search
# Accuracy of the randomized search's fitted estimator on the training
# split, then on the held-out test split.
random_train_accuracy = rf_random.score(X_train, y_train)
print(f'Train Accuracy - : {random_train_accuracy:.3f}')
random_test_accuracy = rf_random.score(X_test, y_test)
print(f'Test Accuracy - : {random_test_accuracy:.3f}')
Train the random forest with the best hyperparameters
Grid Search CV
# Train a fresh forest with the hyperparameters the grid search selected.
# Reading them from best_params_ instead of pasting values from an earlier
# run keeps this cell in sync with the search, and avoids the hard-coded
# max_features='auto', which scikit-learn 1.3 removed.
rf_model_grid = RandomForestClassifier(**rf_Grid.best_params_)
rf_model_grid.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = rf_model_grid.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
Confusion Matrix
# Raw (un-normalized) confusion matrix of test labels vs. predictions,
# drawn as a heat map.
cf = confusion_matrix(y_test, y_pred)
plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix without Normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# One tick per class on both axes.
tick_marks = np.arange(len(set(y_test)))  # length of classes
class_labels = ['0', '1']
plt.xticks(tick_marks, class_labels)
plt.yticks(tick_marks, class_labels)

# plotting text value inside cells; switch to white text on cells darker
# than half the maximum count so the number stays legible.
thresh = cf.max() / 2.
for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
    plt.text(
        j,
        i,
        format(cf[i, j], 'd'),
        horizontalalignment='center',
        color='white' if cf[i, j] > thresh else 'black',
    )
plt.show()
Randomized search CV
# Train a fresh forest with the hyperparameters the randomized search
# selected. Reading them from best_params_ instead of pasting values from
# an earlier run keeps this cell in sync with the search, and avoids the
# hard-coded max_features='auto', which scikit-learn 1.3 removed.
rf_model_randomize = RandomForestClassifier(**rf_random.best_params_)
rf_model_randomize.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = rf_model_randomize.predict(X_test)
report = classification_report(y_test, y_pred)
print("The Classification report: \n {}".format(report))
Confusion Matrix
# Raw (un-normalized) confusion matrix of test labels vs. predictions,
# drawn as a heat map.
cf = confusion_matrix(y_test, y_pred)
plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix without Normalization')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# One tick per class on both axes.
tick_marks = np.arange(len(set(y_test)))  # length of classes
class_labels = ['0', '1']
plt.xticks(tick_marks, class_labels)
plt.yticks(tick_marks, class_labels)

# plotting text value inside cells; switch to white text on cells darker
# than half the maximum count so the number stays legible.
thresh = cf.max() / 2.
for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
    plt.text(
        j,
        i,
        format(cf[i, j], 'd'),
        horizontalalignment='center',
        color='white' if cf[i, j] > thresh else 'black',
    )
plt.show()
Conclusion
This is a very small dataset, so the two searches give quite similar results, but you can apply the same technique to other datasets and see how the results differ.
If this article helped you, don’t forget to like it and share it with your friends 👍 Happy Learning!!