Boruta Feature Selection Explained in Python

Implementation and explanation from scratch

Boruta Feature Selection

Need for Feature Selection?

Boruta Algorithm


Load and process data

# important libraries
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import scipy as sp
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Load the raw stroke dataset; the CSV is expected in the working directory.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
Heart Stroke Dataset
# converting to numeric
# pd.factorize returns (codes, uniques); index [0] keeps only the integer
# codes, so each categorical column is replaced by its label encoding.
categorical_columns = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]
for column in categorical_columns:
    data[column] = pd.factorize(data[column])[0]

# additional cleaning
data.dropna(inplace=True)
data.drop("id", axis=1, inplace=True)
# Rows were removed above, so rebuild a contiguous 0..n-1 index.
data.reset_index(inplace=True, drop=True)
Cleaned data for feature selection
# separate input and output variables
X = data.drop("stroke", axis=1)
y = data["stroke"]

# Append a shuffled "shadow" copy of every original column.
# list(...) snapshots the original column names before new ones are added.
for col in list(X.columns):
    # reset_index(drop=True) discards the shuffled index labels; without it
    # pandas would re-align on the index and undo the shuffle on assignment.
    X[f"shadow_{col}"] = X[col].sample(frac=1).reset_index(drop=True)
Shadow features concatenated
def get_important_features(X, y):
    """Return the columns of ``X`` that out-rank every shadow feature.

    A random forest is fitted on ``X`` and ``y``; a column is selected when
    its importance is strictly greater than the largest importance found
    among the columns whose name starts with ``shadow_``.

    Args:
        X: Feature table exposing ``.columns``; it must include at least one
            ``shadow_``-prefixed column.
        y: Target values passed to the classifier's ``fit`` together with X.

    Returns:
        list: Names of the columns whose importance exceeds the highest
        shadow importance, in the column order of ``X``.

    Raises:
        ValueError: If ``X`` contains no ``shadow_`` column.
    """
    # Initialize Random Forest Classifier
    rf = RandomForestClassifier(max_depth=20)
    # Fit Random Forest on provided data; feature_importances_ is only
    # available on a fitted estimator.
    rf.fit(X, y)
    # Create dictionary of feature importances
    importances = dict(zip(X.columns, rf.feature_importances_))
    # Isolate importances of shadow features (prefix match, so a real feature
    # that merely contains the word "shadow" is not mistaken for one)
    shadow_importances = [
        value
        for name, value in importances.items()
        if str(name).startswith("shadow_")
    ]
    if not shadow_importances:
        raise ValueError("X must contain at least one 'shadow_' column")
    # Importance level of the most important shadow feature
    highest_shadow_feature = max(shadow_importances)
    # Features which fulfil the Boruta selection criterion
    selected_features = [
        name
        for name, value in importances.items()
        if value > highest_shadow_feature
    ]
    return selected_features

Multiple Trials

# Run the shadow-feature comparison TRIALS times and count, per column of
# `data`, how many times the column beat the best shadow feature.
# NOTE(review): the same X (shadow columns shuffled once, above) is reused in
# every trial, so only the forest's own randomness varies between trials.
TRIALS = 50
feature_hits = {i: 0 for i in data.columns}
for _ in tqdm(range(TRIALS)):
    imp_features = get_important_features(X, y)
    for key in feature_hits:
        if key in imp_features:
            feature_hits[key] += 1
print(feature_hits)
{'gender': 0,  'age': 50,  'hypertension': 0,  'heart_disease': 0,  'ever_married': 0,  'work_type': 0,  'Residence_type': 0,  'avg_glucose_level': 50,  'bmi': 1,  'smoking_status': 0,  'stroke': 0}

Binomial Distribution

# Probability mass function of Binomial(TRIALS, 0.5), evaluated at every
# possible hit count 0..TRIALS and stored as a list indexed by hit count.
pmf = list(
    map(
        lambda hit_count: sp.stats.binom.pmf(hit_count, TRIALS, .5),
        range(TRIALS + 1),
    )
)
# trials_in_green_zone
def get_tail_items(pmf, alpha=0.05):
    """Return the first index at which the running sum of ``pmf`` reaches ``alpha``.

    Args:
        pmf: Iterable of probabilities, ordered by outcome index.
        alpha: Cumulative probability to reach; defaults to 0.05.

    Returns:
        int | None: The smallest index ``i`` such that
        ``pmf[0] + ... + pmf[i] >= alpha``, or ``None`` if the running sum
        never reaches ``alpha`` (for example when ``pmf`` is empty).
    """
    cumulative = 0
    for index, probability in enumerate(pmf):
        cumulative += probability
        if cumulative >= alpha:
            return index
    return None
# plot the binomial distribution
# x axis: every possible hit count 0..TRIALS; y axis: its probability.
plt.plot(list(range(TRIALS + 1)), pmf, "-o")
plt.title(f"Binomial distribution for {TRIALS} trials")
plt.xlabel("No. of trials")
Binomial distribution for 50 trials

Final Selection

# select features from n number of trials
def choose_features(feature_hits, TRIALS, thresh):
    """Split features into a green zone and a blue zone by hit count.

    Args:
        feature_hits: Mapping of feature name to its number of hits.
        TRIALS: Total number of trials that were run.
        thresh: Lower cutoff on the hit count; the upper cutoff is
            ``TRIALS - thresh``.

    Returns:
        tuple[list, list]: ``(green_zone, blue_zone)`` where ``green_zone``
        holds features with ``hits >= TRIALS - thresh`` and ``blue_zone``
        holds features with ``thresh <= hits < TRIALS - thresh``. Features
        below ``thresh`` appear in neither list.
    """
    # define boundaries
    green_zone_thresh = TRIALS - thresh
    blue_zone_upper = green_zone_thresh
    blue_zone_lower = thresh

    green_zone = [
        key for key, value in feature_hits.items()
        if value >= green_zone_thresh
    ]
    blue_zone = [
        key for key, value in feature_hits.items()
        if blue_zone_lower <= value < blue_zone_upper
    ]
    return green_zone, blue_zone
# Take the hit-count cutoff from the binomial PMF, then sort the counted
# features into the green and blue zones.
thresh = get_tail_items(pmf)
green, blue = choose_features(
    feature_hits=feature_hits,
    TRIALS=TRIALS,
    thresh=thresh,
)
Important Features according to our Boruta Algorithm



