Sitemap
TDS Archive

An archive of data science, data analytics, data engineering, machine learning, and artificial intelligence writing from the former Towards Data Science Medium publication.

Follow publication

Using Quantum Annealing for Feature Selection in scikit-learn

11 min readApr 10, 2023

--

D-Wave and scikit-learn

Problem and method

>>> dataset.get_data()[0]

attr1 attr2 attr3 attr4 attr5 attr6 attr7 attr8 ... attr293 attr294 Beach Sunset FallFoliage Field Mountain Urban
0 0.646467 0.666435 0.685047 0.699053 0.652746 0.407864 0.150309 0.535193 ... 0.014025 0.029709 1 0 0 0 1 0
1 0.770156 0.767255 0.761053 0.745630 0.742231 0.688086 0.708416 0.757351 ... 0.082672 0.036320 1 0 0 0 0 1
2 0.793984 0.772096 0.761820 0.762213 0.740569 0.734361 0.722677 0.849128 ... 0.112506 0.083924 1 0 0 0 0 0
3 0.938563 0.949260 0.955621 0.966743 0.968649 0.869619 0.696925 0.953460 ... 0.049780 0.090959 1 0 0 0 0 0
4 0.512130 0.524684 0.520020 0.504467 0.471209 0.417654 0.364292 0.562266 ... 0.164270 0.184290 1 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2402 0.875782 0.901653 0.926227 0.721366 0.795826 0.867642 0.794125 0.899067 ... 0.254413 0.134350 0 0 0 0 0 1
2403 0.657706 0.669877 0.692338 0.713920 0.727374 0.750354 0.684372 0.718770 ... 0.048747 0.041638 0 0 0 0 0 1
2404 0.952281 0.944987 0.905556 0.836604 0.875916 0.957034 0.953938 0.967956 ... 0.017547 0.019734 0 0 0 0 0 1
2405 0.883990 0.899004 0.901019 0.904298 0.846402 0.858145 0.851362 0.852472 ... 0.226332 0.223070 0 0 0 0 0 1
2406 0.974915 0.866425 0.818144 0.936140 0.938583 0.935087 0.930597 1.000000 ... 0.025059 0.004033 0 0 0 0 0 1

[2407 rows x 300 columns]
objective function
objective function

Feature selection the hard way

import numpy as np
import openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import plotly.express as px
from plotly.subplots import make_subplots
import dimod
from dwave.system import LeapHybridCQMSampler

dataset = openml.datasets.get_dataset(312)

X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute, dataset_format="dataframe"
)
X = X.astype(float)
y = y.values.astype(int)

clf = RandomForestClassifier()
def evaluate_model(m, X, y, indices=None):
if not indices is None:
X_filtered = X.iloc[:, indices]
else:
X_filtered = X
acc = np.mean(cross_val_score(clf, X_filtered, y, cv=5))
return acc


def show_relevance_redundancy(X, y, indices=None, title=""):
if not indices is None:
X = X.iloc[:, indices].copy()
y = y
fig = make_subplots(
rows=1,
cols=2,
column_widths=[0.68, 0.32],
column_titles=["relevance", "redundancy"],
)
trace_rel = px.bar(np.array([abs(np.corrcoef(x, y)[0, 1]) for x in X.values.T]))
trace_red = px.imshow(abs(np.corrcoef(X.values, rowvar=False)))
fig.add_trace(trace_rel.data[0], row=1, col=1)
fig.add_trace(trace_red.data[0], row=1, col=2)
fig.update_layout(width=1200, height=480, title=title)
fig.show()


show_relevance_redundancy(
X,
y,
None,
f"full dataset: acc={evaluate_model(clf, X, y, None):.4f}",
)
baseline performance
baseline performance
k = 30
# Pearson correlation
correlation_matrix = abs(np.corrcoef(np.hstack((X, y[:, np.newaxis])), rowvar=False))
# fix the alpha parameter from the Milne paper
# to account for the numerous quadratic terms that are possible
beta = 0.5
alpha = 2 * beta * (k - 1) / (1 - beta + 2 * beta * (k - 1))
# generate weights for linear and quadratic terms, per Milne algorithm
Rxy = correlation_matrix[:, -1]
Q = correlation_matrix[:-1, :-1] * (1 - alpha)
np.fill_diagonal(Q, -Rxy * alpha)
# create binary quadratic model from the linear and quadratic weights
bqm = dimod.BinaryQuadraticModel(Q, "BINARY")
# create constrained quadratic model
cqm = dimod.ConstrainedQuadraticModel()
# the objective function of the CQM is the same as BQM
cqm.set_objective(bqm)
# constraint: limit the number of features to k
cqm.add_constraint_from_iterable(
((i, 1) for i in range(len(cqm.variables))), "==", rhs=k
)
# the sampler that will be used is the hybrid sampler in the DWave cloud
sampler = LeapHybridCQMSampler()
# solve the problem
sampleset = sampler.sample_cqm(cqm)
# postprocess results
feasible = sampleset.filter(lambda s: s.is_feasible)
if feasible:
best = feasible.first
else:
assert len(cqm.constraints) == 1
best = sorted(
sampleset.data(),
key=lambda x: (list(cqm.violations(x.sample).values())[0], x.energy),
)[0]
assert list(best.sample.keys()) == sorted(best.sample.keys())
is_selected = np.array([bool(val) for val in best.sample.values()])
features = np.array([i for i, val in enumerate(is_selected) if val])
best_energy = best.energy
show_relevance_redundancy(
X,
y,
features,
f"explicit optimization: acc={evaluate_model(clf, X, y, features):.4f}",
)
explicit optimization
explicit optimization

Feature selection the easy way

X_new = SelectFromQuadraticModel(num_features=30, alpha=0.5).fit_transform(X, y)
X_new_df = pd.DataFrame(data=X_new, columns=list(range(X_new.shape[1])))

show_relevance_redundancy(
X_new_df,
y,
None,
f"plugin optimization: acc={evaluate_model(clf, X_new_df, y, None):.4f}",
)
plugin optimization
plugin optimization

Further topics to explore

References

--

--

TDS Archive
TDS Archive

Published in TDS Archive

An archive of data science, data analytics, data engineering, machine learning, and artificial intelligence writing from the former Towards Data Science Medium publication.

Florin Andrei
Florin Andrei

Written by Florin Andrei

BS in Physics. MS in Data Science. Over a decade experience with cloud computing.

No responses yet