Normality Testing Examples
Dataset ABC Company Profit
Applying different normality-testing approaches to the dataset CompanyABCProfit.csv
Importing libraries and Loading the Dataset
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline
# Load the company profit table; cells containing the literal text "NA" are
# parsed as missing values.
profit = pd.read_csv(
    "CompanyABCProfit.csv",
    encoding='latin-1',
    na_values='NA',
)
Exploration of dataset
# Exploration: density-normalised histogram of the Profit column in 20 bins.
profit["Profit"].hist(bins=20, density=True)
Calculating Mean Variance and Standard Deviation
# Maximum-likelihood estimates of the normal parameters for the Profit column.
mean = profit.Profit.mean()
# pandas' var() defaults to ddof=1 (the unbiased sample variance). The
# maximum-likelihood estimator divides by n, so ddof=0 is requested explicitly
# to make the printed labels accurate.
variance = profit.Profit.var(ddof=0)
stddev = math.sqrt(variance)
print("Mean from maximum likelihood",mean)
print("Variance from maximum likelihood",variance)
print("Standard deviation from maximum likelihood",stddev)
We calculate the mean and variance of the data, then plot the normal pdf on top of the histogram
# Draw the density-normalised histogram, then overlay the normal pdf built
# from the previously computed mean and standard deviation.
profit["Profit"].hist(density=True)
x_min, x_max = plt.xlim()
# Evaluate the pdf on one shared grid spanning the current x-axis limits.
pdf_x = np.linspace(x_min, x_max)
plt.plot(pdf_x, norm.pdf(pdf_x, mean, stddev))
Calculating the mean and standard deviation after fitting
from scipy.stats import norm
# Fit a normal distribution to the Profit column; norm.fit returns the
# estimated location (mean) and scale (standard deviation), in that order.
fitted_params = norm.fit(profit["Profit"])
mu, std = fitted_params
print("Mean after fitting", mu)
print("Standard deviation after fitting", std)
Since the mean and standard deviation before and after fitting are so close, the normal PDFs plotted before and after fitting overlap each other
# Overlay both normal curves on the histogram: green uses the directly
# computed mean/stddev, yellow uses the parameters returned by norm.fit.
profit["Profit"].hist(density=True)
x_min, x_max = plt.xlim()
pdf_x = np.linspace(x_min, x_max)
density_before = norm.pdf(pdf_x, mean, stddev)
density_after = norm.pdf(pdf_x, mu, std)
plt.plot(pdf_x, density_before, linestyle='dashed', color='green')
plt.plot(pdf_x, density_after, linestyle='dashed', color='yellow')
Shapiro Wilk Test
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Shapiro-Wilk test on the observed Profit values (no random sample is drawn).
# H0: the sample comes from a normal distribution.
data = profit.Profit
alpha = 0.05
w, p_value = stats.shapiro(data)

# A p-value above alpha means there is not enough evidence to reject H0.
verdict = (
    "Sample looks normal (fail to reject H0)"
    if p_value > alpha
    else "Sample does not look normal (reject H0)"
)
print(verdict)

# Q-Q plot of the sample quantiles against the theoretical normal quantiles.
stats.probplot(data, plot=plt)
plt.show()
Anderson-Darling test
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Anderson-Darling test on the Profit values (default distribution: normal).
data = profit.Profit
result = stats.anderson(data)

# The test reports no p-value here; the statistic is compared against one
# tabulated critical value per significance level.
critical_values = result.critical_values
significance_level = result.significance_level
for critical_value, level in zip(critical_values, significance_level):
    outcome = (
        "Sample looks normal (fail to reject H0) at significance level: "
        if result.statistic < critical_value
        else "Sample does not look normal (reject H0) at significance level: "
    )
    print(outcome, level)

# Q-Q plot of the sample quantiles against the theoretical normal quantiles.
stats.probplot(data, plot=plt)
plt.show()
Kolmogorov-Smirnov test
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Kolmogorov-Smirnov test of the Profit values against a normal distribution.
data = profit.Profit

# kstest(data, 'norm') with no distribution arguments compares against the
# *standard* normal N(0, 1), which rejects any sample that has not been
# standardised. Pass the sample's own location and scale so that the shape of
# the distribution is what gets tested.
# NOTE: the parameters are estimated from the same data, so the p-value is
# only approximate (it tends to be conservative); a Lilliefors-type
# correction would be needed for an exact level.
sample_mean = data.mean()
sample_std = data.std()
D, p_value = stats.kstest(data, 'norm', args=(sample_mean, sample_std))

# A p-value above alpha means there is not enough evidence to reject H0.
alpha = 0.05
if p_value > alpha:
    print("Sample looks normal (fail to reject H0)")
else:
    print("Sample does not look normal (reject H0)")

# Q-Q plot of the sample quantiles against the theoretical normal quantiles.
stats.probplot(data, plot=plt)
plt.show()
Weight Height Example
Dataset weight-height.csv
Applying different normality-testing approaches to the dataset weight-height.csv
Loading the Dataset
# Load the weight/height table; cells containing the literal text "NA" are
# parsed as missing values.
weight_height = pd.read_csv(
    "weight-height.csv",
    encoding='latin-1',
    na_values='NA',
)
We’re considering only height for this example
# Keep only the Height column; the tests below are run on this series.
height = weight_height["Height"]
Shapiro Wilk Test
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Shapiro-Wilk test on the observed heights (no random sample is drawn).
# H0: the sample comes from a normal distribution.
# Seeds NumPy's global RNG; nothing in this cell draws random numbers.
np.random.seed(0)
data = height
alpha = 0.05
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
# inaccurate for samples larger than 5000 - confirm the size of this dataset.
w, p_value = stats.shapiro(data)

# A p-value above alpha means there is not enough evidence to reject H0.
verdict = (
    "Sample looks normal (fail to reject H0)"
    if p_value > alpha
    else "Sample does not look normal (reject H0)"
)
print(verdict)

# Q-Q plot of the sample quantiles against the theoretical normal quantiles.
stats.probplot(data, plot=plt)
plt.show()
Anderson-Darling test
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Anderson-Darling test on the observed heights (default distribution: normal).
# Seeds NumPy's global RNG; nothing in this cell draws random numbers.
np.random.seed(0)
data = height
result = stats.anderson(data)

# The test reports no p-value here; the statistic is compared against one
# tabulated critical value per significance level.
critical_values = result.critical_values
significance_level = result.significance_level
for critical_value, level in zip(critical_values, significance_level):
    outcome = (
        "Sample looks normal (fail to reject H0) at significance level: "
        if result.statistic < critical_value
        else "Sample does not look normal (reject H0) at significance level: "
    )
    print(outcome, level)

# Q-Q plot of the sample quantiles against the theoretical normal quantiles.
stats.probplot(data, plot=plt)
plt.show()
Kolmogorov-Smirnov test
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Kolmogorov-Smirnov test of the observed heights against a normal
# distribution (no random sample is drawn).
# Seeds NumPy's global RNG; nothing in this cell draws random numbers.
np.random.seed(0)
data = height

# kstest(data, 'norm') with no distribution arguments compares against the
# *standard* normal N(0, 1), which rejects any sample that has not been
# standardised. Pass the sample's own location and scale so that the shape of
# the distribution is what gets tested.
# NOTE: the parameters are estimated from the same data, so the p-value is
# only approximate (it tends to be conservative); a Lilliefors-type
# correction would be needed for an exact level.
sample_mean = data.mean()
sample_std = data.std()
D, p_value = stats.kstest(data, 'norm', args=(sample_mean, sample_std))

# A p-value above alpha means there is not enough evidence to reject H0.
alpha = 0.05
if p_value > alpha:
    print("Sample looks normal (fail to reject H0)")
else:
    print("Sample does not look normal (reject H0)")

# Q-Q plot of the sample quantiles against the theoretical normal quantiles.
stats.probplot(data, plot=plt)
plt.show()