Exploratory Data Analysis

Tess Ofili
5 min read · Aug 22, 2022

--

Exploratory data analysis (EDA) is a type of study that seeks out broad trends in the data. These patterns include anomalies and potentially surprising aspects of the data. Any data analysis should begin with EDA.

The data below shows the performance of the Sales Department.

# Import all necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (10Alytics Data) from a CSV file in the working directory.

data = pd.read_csv('10Alytics Data.csv')

# Top 5 rows

data.head(5)

# Bottom 5 rows

data.tail(5)

# Column names, non-null counts and dtypes

data.info()

# Summary statistics

data.describe()

# Use .loc with boolean masks to place each customer in an age group:
#   <= 19 -> teenagers, (19, 30] -> youth, (30, 40] -> adult, > 40 -> elder

# Start from an empty label; rows matched by no mask below keep it.
data['Age_Group'] = ''

# `<= 19` rather than `< 19`: the youth bucket starts at `> 19`, so with a
# strict `<` here customers aged exactly 19 would fall into no group.
data.loc[data['Customer_Age'] <= 19, 'Age_Group'] = 'teenagers'
data.loc[(data['Customer_Age'] > 19) & (data['Customer_Age'] <= 30), 'Age_Group'] = 'youth'
data.loc[(data['Customer_Age'] > 30) & (data['Customer_Age'] <= 40), 'Age_Group'] = 'adult'
data.loc[data['Customer_Age'] > 40, 'Age_Group'] = 'elder'

data.head()

# Create new columns with the following information:
#   Total Cost    = Quantity * Unit_Cost
#   Total Revenue = Quantity * Unit_Price
#   Profit/Loss   = Total Revenue - Total Cost

data['Total_Cost'] = data['Quantity'] * data['Unit_Cost']
data['Total_Revenue'] = data['Quantity'] * data['Unit_Price']
# Plain ASCII minus: a typographic dash is not a valid Python operator.
data['Profit/Loss'] = data['Total_Revenue'] - data['Total_Cost']

data.head()

# Create a new column that categorizes the numeric Profit/Loss column:
#   less than 0    -> 'Loss'
#   greater than 0 -> 'Profit'
# Rows whose Profit/Loss is exactly 0 keep the empty-string placeholder.

data['Profit_Loss'] = ''

data.loc[data['Profit/Loss'] > 0, 'Profit_Loss'] = 'Profit'
data.loc[data['Profit/Loss'] < 0, 'Profit_Loss'] = 'Loss'

data.head()

Data visualization will be used to answer the following questions (each measured by number of records):

  • Which customer category occurs most often?
  • Which salesperson has the most sales records?
  • Which customer age group is the largest?
  • Which state has the most records?
  • Which product category occurs most often?
  • Which sub-category occurs most often?
  • Which is more frequent, profit or loss?

# Correct the misspelling 'Hign' in the Customer column so that the value
# is spelled 'High' uniformly.

data['Customer'] = data['Customer'].replace(['Hign'], 'High')

# Rename the columns whose names contain spaces so they use underscores.

data.rename(columns={'Sales Person': 'Sales_Person', 'Payment Option': 'Payment_Option'}, inplace=True)

data

# Pie chart of the number of rows per Customer value.
# Which customer value occurs most often?

customer = data.groupby('Customer').Customer.count().sort_values(ascending=False)

plt.figure(figsize=(5, 5))
plt.pie(customer, labels=customer.index)

plt.title('Customer By Count', loc='left')
plt.show()

The medium customers are the most numerous.

# Bar chart of the number of rows per salesperson (top 10 by count).
# Which salesperson has the most records?

sales_person = data.groupby('Sales_Person').Sales_Person.count().sort_values(ascending=False)

# Slice the labels and the heights together: passing 10 labels with the full
# series as heights raises a shape mismatch when there are more than 10 people.
top_sales_person = sales_person[:10]
plt.bar(x=top_sales_person.index, height=top_sales_person)

plt.xlabel('sales_person')
plt.ylabel('count_sales_person')
plt.title('Sales By Count', loc='left')

plt.show()

Remota is the salesperson with the highest number of sales records.

# Horizontal bar chart of the number of rows per customer age group.
# Which age group is the largest?

# Ascending sort so that barh draws the largest group at the top.
customer_age = data.groupby('Age_Group')['Age_Group'].count().sort_values(ascending=True)
customer_age

plt.title('customer_age', loc='left')
plt.barh(y=customer_age.index, width=customer_age, color='orange')

plt.show()

The elder group is the largest.

# Bar chart of the number of rows per State (top 5 by count).
# Which State has the most records?

# Already limited to the five largest counts, so no further slicing is
# needed when plotting.
state = data.groupby('State').State.count().sort_values(ascending=False)[:5]

plt.bar(x=state.index, height=state)

plt.xlabel('state')
plt.ylabel('count_state')

plt.title('State By Count', loc='left')

plt.show()

Lagos has the highest number of records.

# Pie chart of the number of rows per product category.
# Which product category occurs most often?

pdts = data.groupby('Product_Category').Product_Category.count().sort_values(ascending=False)

plt.figure(figsize=(5, 5))
plt.pie(pdts, labels=pdts.index)

plt.title('Products By Count', loc='left')
plt.show()

Accessories is the product category with the highest count.

# Bar chart of the number of rows per Sub_Category (top 5 by count).
# Which Sub_Category occurs most often?

# Computed once; limited to the five largest counts.
Sub_Catg = data.groupby('Sub_Category')['Sub_Category'].count().sort_values(ascending=False)[:5]
Sub_Catg

plt.bar(x=Sub_Catg.index, height=Sub_Catg)

plt.xlabel('Sub_Catg ')
plt.ylabel('Count_Sub_Category')
plt.title('Subcategory By Count', loc='left')

plt.show()

Keyboard is the sub-category with the highest count.

# Horizontal bar chart of the number of rows per Profit_Loss label.
# Which is more frequent, profit or loss?

P_L = data.groupby('Profit_Loss')['Profit_Loss'].count().sort_values(ascending=False)
P_L

plt.barh(y=P_L.index, width=P_L, color='green')
# Set the title once; an earlier title call would simply be overwritten.
plt.title('Profit/loss By Count', loc='left')
plt.show()

The profit was higher.

More Data Analytics

  • Which Salesperson generated the most profit?
  • Which gender generates the most profit?
  • Which State generates the most profit
  • What product generates the most profit?
  • What payment option generates the most profit?

# Which salesperson generated the most profit?
# Sum Profit/Loss per salesperson and plot it as horizontal bars.

# Ascending sort so that barh draws the largest total at the top.
salesper = data.groupby('Sales_Person')['Profit/Loss'].sum().sort_values(ascending=True)

plt.barh(y=salesper.index, width=salesper)

plt.title('Top Sales Person', loc='left')
plt.show()

Feyisola made the most profit

# Which gender generates the most profit?
# Sum Profit/Loss per customer gender and plot it as a bar chart.

gend = data.groupby('Customer_Gender')['Profit/Loss'].sum().sort_values(ascending=True)

# Plot every group: slicing only the labels (and not the heights) would
# raise a shape mismatch if there were more than five groups.
plt.bar(x=gend.index, height=gend)

plt.xlabel('gender ')
plt.ylabel('sum Profit/Loss')

plt.title('Most Profitable By Gender', loc='right')
plt.show()

Male customers are the most profitable gender.

# Which State generates the most profit?
# Sum Profit/Loss per State and plot the five largest totals.

state_profit = data.groupby('State')['Profit/Loss'].sum().sort_values(ascending=False)

top_state_profit = state_profit[:5]
plt.bar(x=top_state_profit.index, height=top_state_profit)

plt.xlabel('state')
plt.ylabel('sum profit/loss')

plt.title('Most Profitable By State', loc='right')

plt.show()

Lagos generates the most profit

# What product category generates the most profit?
# Sum Profit/Loss per product category and plot it as horizontal bars.

prdt = data.groupby('Product_Category')['Profit/Loss'].sum().sort_values(ascending=False)

plt.barh(y=prdt.index, width=prdt)

plt.title('Top Sales by Product', loc='left')
plt.show()

Accessories generates the most profit.

# What payment option generates the most profit?
# Sum Profit/Loss per payment option and plot the five largest totals.

# Descending sort so that the [:5] slice keeps the MOST profitable options;
# with an ascending sort it would keep the five least profitable ones
# whenever there are more than five payment options.
paymt = data.groupby('Payment_Option')['Profit/Loss'].sum().sort_values(ascending=False)

top_paymt = paymt[:5]
plt.bar(x=top_paymt.index, height=top_paymt)

plt.xlabel('paymt')
plt.ylabel('sum profit/loss')

plt.title('Most Profitable By Payment', loc='right')

plt.show()

The cash payment method generated the highest profit.

Exploratory Data Analysis (EDA) has provided us with information about the firm by revealing the areas where the largest profits were produced. This will help the company determine where to enhance its products and the markets in which they are successful. Additionally, it highlighted the top salesperson in terms of the profit generated for the business.

--

--