Geek Culture
Published in

Geek Culture

Boruta Feature Selection Explained in Python

Implementation and explanation from scratch

Boruta Feature Selection

Why Do We Need Feature Selection?

Boruta Algorithm

Implementation

Load and process data

# important libraries
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import scipy as sp
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Load the stroke dataset and preview the first rows.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data.head()
Heart Stroke Dataset
# converting to numeric: factorize each categorical column to integer codes
data["gender"] = pd.factorize(data["gender"])[0]
data["ever_married"] = pd.factorize(data["ever_married"])[0]
data["work_type"] = pd.factorize(data["work_type"])[0]
data["Residence_type"] = pd.factorize(data["Residence_type"])[0]
data["smoking_status"] = pd.factorize(data["smoking_status"])[0]

# additional cleaning: drop rows with missing values, drop the id column,
# and reset the index so later row-wise operations align.
data.dropna(inplace=True)
data.drop("id", axis=1, inplace=True)
data.reset_index(inplace=True, drop=True)
data.head()
Cleaned data for feature selection
# separate input and output variables
X = data.drop("stroke", axis=1)
y = data["stroke"]

# Create a shuffled "shadow" copy of every original column; these act as
# the importance baseline in the Boruta procedure.
# X.columns is snapshotted before the loop, so adding columns inside is safe.
for col in X.columns:
    X[f"shadow_{col}"] = X[col].sample(frac=1).reset_index(drop=True)
Shadow features concatenated
def get_important_features(X, y):
    """Run one Boruta trial: fit a random forest on X (original plus
    shadow features) and return the names of the features whose
    importance strictly exceeds that of the best shadow feature.

    X : DataFrame containing both original and "shadow_*" columns.
    y : target labels.
    Returns a list of selected feature names.
    """
    # Initialize Random Forest classifier
    rf = RandomForestClassifier(max_depth=20)
    # Fit Random Forest on provided data
    rf.fit(X, y)
    # Create dictionary of feature importances
    importances = dict(zip(X.columns, rf.feature_importances_))
    # Isolate importances of shadow features
    only_shadow_feat_importance = {key: value for key, value in importances.items() if "shadow" in key}
    # Importance level of the most important shadow feature
    # (max() replaces sorting the whole dict just to read one entry).
    highest_shadow_feature = max(only_shadow_feat_importance.values())
    # Original features that fulfil the Boruta selection criterion
    selected_features = [key for key, value in importances.items() if value > highest_shadow_feature]
    return selected_features

Multiple Trials

TRIALS = 50feature_hits = {i:0 for i in data.columns}for _ in tqdm(range(TRIALS)):    imp_features = get_important_features(X, y)        for key, _ in feature_hits.items():            if key in imp_features: feature_hits[key] += 1print(feature_hits)
{'gender': 0,  'age': 50,  'hypertension': 0,  'heart_disease': 0,  'ever_married': 0,  'work_type': 0,  'Residence_type': 0,  'avg_glucose_level': 50,  'bmi': 1,  'smoking_status': 0,  'stroke': 0}

Binomial Distribution

# Probability mass function of Binomial(TRIALS, 0.5): the chance of each
# possible hit count under the null hypothesis.
pmf = [sp.stats.binom.pmf(k, TRIALS, 0.5) for k in range(TRIALS + 1)]
# trials_in_green_zone
def get_tail_items(pmf):
    """Return the index at which the cumulative probability of *pmf*
    first reaches 0.05 — the left-tail cutoff of the binomial test.

    If the cumulative sum never reaches 0.05, the last index is returned.
    Raises NameError on an empty pmf (i is never bound), as in the
    original notebook.
    """
    total = 0
    for i, x in enumerate(pmf):
        total += x
        if total >= 0.05:
            break
    return i
# plot the binomial distribution
plt.plot(list(range(TRIALS + 1)), pmf, "-o")
plt.title(f"Binomial distribution for {TRIALS} trials")
# The x-axis is the number of hits k (0..TRIALS), not the trial count —
# the original label "No. of trials" was misleading.
plt.xlabel("No. of hits")
plt.ylabel("Probability")
plt.grid(True)
Binomial distribution for 50 trials

Final Selection

# select features from n number of trials
def choose_features(feature_hits, TRIALS, thresh):
    """Split features into Boruta zones based on their hit counts.

    feature_hits : dict mapping feature name -> number of trials the
        feature beat the best shadow feature.
    TRIALS : total number of trials run.
    thresh : left-tail cutoff (from get_tail_items).

    Returns (green_zone, blue_zone): green = confidently important
    (hits >= TRIALS - thresh); blue = undecided (thresh <= hits < TRIALS - thresh).
    """
    # define boundaries
    green_zone_thresh = TRIALS - thresh
    blue_zone_upper = green_zone_thresh
    blue_zone_lower = thresh
    green_zone = [key for key, value in feature_hits.items()
                  if value >= green_zone_thresh]
    blue_zone = [key for key, value in feature_hits.items()
                 if blue_zone_lower <= value < blue_zone_upper]
    return green_zone, blue_zone
# Left-tail cutoff of the binomial distribution computed above.
thresh = get_tail_items(pmf)
# Partition features into confidently-important (green) and undecided (blue).
green, blue = choose_features(feature_hits, TRIALS, thresh)
# Display both zones (notebook cell output).
green,blue
Important Features according to our Boruta Algorithm

--

--

A new tech publication by Start it up (https://medium.com/swlh).

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Moosa Ali

Blogger | Data Scientist | Machine Learning Engineer. For more content, visit: www.writersbyte.com. Support me on: ko-fi.com/moosaali9906