# Step by Step — Run Exploratory Data Analysis

## How can you draw relevant conclusions without knowing anything about the underlying data?

Feb 12 · 7 min read

## Get Started

```python
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
import seaborn as sns

# Jupyter/IPython magic: render figures inline in the notebook.
# Remove this line when running the code as a plain Python script.
%matplotlib inline

# Read the data (expects notes.csv in the current working directory)
df = pd.read_csv(Path.cwd() / 'notes.csv')
df.head()
```

# Univariate Analysis

```python
# How many individuals do we have for each category?
# size() counts the rows of each group, so missing values in any
# feature column cannot lower the totals.
df.groupby('is_genuine').size()
```
```python
# Every column except the boolean label is a feature to plot
features = df.columns.drop('is_genuine')

# Split the notes by label once, outside the loop
genuine = df[df['is_genuine'] == True]
fake = df[df['is_genuine'] == False]

# Check the distribution of each column, overlaying both categories
for col in features:
    plt.figure(figsize=(8, 6))
    plt.hist(genuine[col], bins=50, alpha=0.5, label="genuine")
    plt.hist(fake[col], bins=50, alpha=0.5, label="fake")
    plt.title(col)
    plt.legend(loc='upper right')
```
```python
# Create boxplots to visualize the potential outliers.
# The 3 x 2 grid provides six axes; this presumably matches six numeric
# feature columns in notes.csv (confirm against the data).
fig, ax_new = plt.subplots(3, 2, sharey=False, figsize=(20, 17))
df.boxplot(by="is_genuine", ax=ax_new)
```

# Outliers

```python
# Create a function to identify the outliers for each feature
def get_outliers(df):
    """Return the rows of *df* that contain at least one outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its own column.
    Only numeric columns are tested, but the returned rows keep every
    column of *df*.
    """
    # Leave the boolean label (and any other non-numeric column) out of
    # the quantile arithmetic.
    numeric = df.select_dtypes(include='number')
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
    return df[is_outlier.any(axis=1)]


# Apply the function at label level, so each note is compared with the
# notes of its own category. Iterating over the groups hands each group's
# full frame (label column included) to get_outliers.
pd.concat(
    [get_outliers(group) for _, group in df.groupby('is_genuine')],
    ignore_index=True,
)
```

# Bivariate Analysis

```python
# Visualize the correlation & distribution of the variables
sns.pairplot(df)
```
```python
# Heatmap of the correlation matrix
sns.heatmap(df.corr(), annot=True)
```

