Normality Testing Examples

Ananthakrishnan Harikumar
AI Skunks
Published in
4 min read · Mar 31, 2023

Dataset ABC Company Profit

Implementing different normality-testing approaches on the dataset CompanyABCProfit.csv

Importing libraries and Loading the Dataset

import pandas as pd
import numpy as np
import math
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline

# Load the company profit table; cells containing the literal text "NA"
# are read as missing values.
profit = pd.read_csv(
    "CompanyABCProfit.csv",
    na_values="NA",
    encoding="latin-1",
)

Exploration of dataset

# Exploration: density-normalised histogram of the Profit column, 20 bins.
profit["Profit"].hist(bins=20, density=True)

Calculating Mean, Variance and Standard Deviation

# Maximum-likelihood estimates of the normal parameters for Profit.
#
# pandas' Series.var() defaults to ddof=1 (the unbiased sample variance,
# dividing by n - 1). The maximum-likelihood estimate divides by n, so
# ddof=0 is required for the numbers to match the "maximum likelihood"
# labels printed below.
mean = profit.Profit.mean()
variance = profit.Profit.var(ddof=0)
stddev = math.sqrt(variance)
print("Mean from maximum likelihood", mean)
print("Variance from maximum likelihood", variance)
print("Standard deviation from maximum likelihood", stddev)

We calculate the mean and variance of the data, then plot the normal pdf on top of the histogram

# Draw the density-normalised histogram, then overlay the normal pdf with
# the previously computed mean/stddev across the x-range of the axes.
profit.Profit.hist(density=True)
x_min, x_max = plt.xlim()
grid = np.linspace(x_min, x_max)
density = norm.pdf(grid, mean, stddev)
plt.plot(grid, density)

Calculating the mean and standard deviation after fitting

from scipy.stats import norm
# Fit a normal distribution to Profit and unpack the two fitted
# parameters as (mu, std).
fitted_params = norm.fit(profit["Profit"])
mu, std = fitted_params

for label, value in (
    ("Mean after fitting", mu),
    ("Standard deviation after fitting", std),
):
    print(label, value)

Since the mean and standard deviation before and after fitting are so close, the normal PDFs plotted before and after fitting overlap each other.

# Overlay both normal curves on the histogram: the one built from the
# directly computed mean/stddev (green) and the one from norm.fit (yellow).
profit.Profit.hist(density=True)
x_min, x_max = plt.xlim()
grid = np.linspace(x_min, x_max)
curves = (
    (mean, stddev, "green"),
    (mu, std, "yellow"),
)
for loc, scale, colour in curves:
    plt.plot(grid, norm.pdf(grid, loc, scale), linestyle="dashed", color=colour)

Shapiro-Wilk Test

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Shapiro-Wilk normality test on the observed Profit column
# (no random sample is generated here; the real data is tested).
data = profit.Profit

# H0: the sample was drawn from a normal distribution.
w, p_value = stats.shapiro(data)

# Decide at the 5% significance level.
alpha = 0.05
if p_value > alpha:
    print("Sample looks normal (fail to reject H0)")
else:
    print("Sample does not look normal (reject H0)")

# Q-Q plot of the sample against normal quantiles as a visual check.
stats.probplot(data, plot=plt)
plt.show()

Anderson-Darling test

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt


# Anderson-Darling normality test on the observed Profit column.
data = profit.Profit

# stats.anderson tests against the normal distribution by default.
result = stats.anderson(data)

# This result exposes no p-value to check. Instead the test statistic is
# compared with one critical value per significance level (the levels are
# given in percent). H0 (normality) is rejected at a level when the
# statistic is not below that level's critical value.
critical_values = result.critical_values
significance_level = result.significance_level
for critical_value, level in zip(critical_values, significance_level):
    if result.statistic < critical_value:
        print("Sample looks normal (fail to reject H0) at significance level: ", level)
    else:
        print("Sample does not look normal (reject H0) at significance level: ", level)

# Q-Q plot of the sample against normal quantiles as a visual check.
stats.probplot(data, plot=plt)
plt.show()

Kolmogorov-Smirnov test

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Kolmogorov-Smirnov normality test on the observed Profit column.
data = profit.Profit

# kstest(data, 'norm') with no extra arguments compares the sample with the
# *standard* normal N(0, 1), so for unstandardised data it mostly measures
# the mismatch in location and scale rather than departure from a normal
# shape. Pass the sample's own mean and standard deviation as the loc/scale
# of the reference distribution instead.
# NOTE: because these parameters are estimated from the same sample, the
# reported p-value is only approximate (it tends to be too large); the
# Lilliefors variant of the test corrects for this.
D, p_value = stats.kstest(data, 'norm', args=(data.mean(), data.std()))

# Decide at the 5% significance level.
alpha = 0.05
if p_value > alpha:
    print("Sample looks normal (fail to reject H0)")
else:
    print("Sample does not look normal (reject H0)")

# Q-Q plot of the sample against normal quantiles as a visual check.
stats.probplot(data, plot=plt)
plt.show()

Weight Height Example

Dataset weight-height.csv

Implementing different normality-testing approaches on the dataset weight-height.csv

Loading the Dataset

# Load the weight/height table; cells containing the literal text "NA"
# are read as missing values.
weight_height = pd.read_csv(
    "weight-height.csv",
    na_values="NA",
    encoding="latin-1",
)

We’re considering only height for this example

# Keep only the Height column; the normality tests below run on it.
height = weight_height["Height"]

Shapiro-Wilk Test

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Shapiro-Wilk normality test on the observed Height column
# (no random sample is generated here; the real data is tested).
# Nothing in this cell appears to draw random numbers; the seed call is
# kept only so the global NumPy RNG state is the same as before.
np.random.seed(0)
data = height

# H0: the sample was drawn from a normal distribution.
# NOTE(review): SciPy warns that the Shapiro-Wilk p-value may be inaccurate
# for samples larger than 5000 observations - confirm the row count of
# weight-height.csv before relying on this result.
w, p_value = stats.shapiro(data)

# Decide at the 5% significance level.
alpha = 0.05
if p_value > alpha:
    print("Sample looks normal (fail to reject H0)")
else:
    print("Sample does not look normal (reject H0)")

# Q-Q plot of the sample against normal quantiles as a visual check.
stats.probplot(data, plot=plt)
plt.show()

Anderson-Darling test

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Anderson-Darling normality test on the observed Height column
# (no random sample is generated here; the real data is tested).
# Nothing in this cell appears to draw random numbers; the seed call is
# kept only so the global NumPy RNG state is the same as before.
np.random.seed(0)
data = height

# stats.anderson tests against the normal distribution by default.
result = stats.anderson(data)

# This result exposes no p-value to check. Instead the test statistic is
# compared with one critical value per significance level (the levels are
# given in percent). H0 (normality) is rejected at a level when the
# statistic is not below that level's critical value.
critical_values = result.critical_values
significance_level = result.significance_level
for critical_value, level in zip(critical_values, significance_level):
    if result.statistic < critical_value:
        print("Sample looks normal (fail to reject H0) at significance level: ", level)
    else:
        print("Sample does not look normal (reject H0) at significance level: ", level)

# Q-Q plot of the sample against normal quantiles as a visual check.
stats.probplot(data, plot=plt)
plt.show()

Kolmogorov-Smirnov test

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Kolmogorov-Smirnov normality test on the observed Height column
# (no random sample is generated here; the real data is tested).
# Nothing in this cell appears to draw random numbers; the seed call is
# kept only so the global NumPy RNG state is the same as before.
np.random.seed(0)
data = height

# kstest(data, 'norm') with no extra arguments compares the sample with the
# *standard* normal N(0, 1), so for unstandardised data it mostly measures
# the mismatch in location and scale rather than departure from a normal
# shape. Pass the sample's own mean and standard deviation as the loc/scale
# of the reference distribution instead.
# NOTE: because these parameters are estimated from the same sample, the
# reported p-value is only approximate (it tends to be too large); the
# Lilliefors variant of the test corrects for this.
D, p_value = stats.kstest(data, 'norm', args=(data.mean(), data.std()))

# Decide at the 5% significance level.
alpha = 0.05
if p_value > alpha:
    print("Sample looks normal (fail to reject H0)")
else:
    print("Sample does not look normal (reject H0)")

# Q-Q plot of the sample against normal quantiles as a visual check.
stats.probplot(data, plot=plt)
plt.show()

--

--

Ananthakrishnan Harikumar
AI Skunks
0 Followers
Writer for

A common man who believes in love and compassion more than religion and boundaries.