Data Clean

VAV Data
2 min read · Mar 21, 2024

--

Here, we will clean data of different types.

Photo by Daiga Ellaby on Unsplash

This code creates a DataFrame and cleans its data with the help of the pandas library.

import pandas as pd

# Toy dataset mixing numeric and text columns; 'Age' and 'Salary' each
# contain one missing entry (None) for the cleaning steps to fill in.
data = dict(
    Age=[25, 30, 35, None, 40],
    Gender=['Male', 'Female', 'Male', 'Male', 'Female'],
    Salary=[50000, 60000, None, 70000, 80000],
    Education=['Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters'],
    City=['New York', 'Chicago', 'Los Angeles', 'New York', 'Chicago'],
)

# One DataFrame column per dictionary key, in insertion order.
df = pd.DataFrame(data)

Photo by Jonathan Chng on Unsplash
# NOTE: every fill below assigns the result back to the column instead of
# calling df['col'].fillna(..., inplace=True). The in-place form operates on
# an intermediate Series; pandas 2.x warns about it and, with Copy-on-Write
# enabled, it leaves df unchanged.

# Categorical data cleaning: fill gaps with the most frequent value.
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])


# Continuous data cleaning: mean for Age, median for Salary.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())


# Nominal data cleaning: mark gaps with an explicit placeholder label.
df['Education'] = df['Education'].fillna('Unknown')
df['City'] = df['City'].fillna('Unknown')


print("Cleaned Data:")
print(df)

The same kind of cleaning can also be done with NumPy.

Photo by Kim Greenhalgh on Unsplash
import numpy as np

# Sample rows of (age, gender, salary) with two missing entries: a NaN salary
# and a None age. The array holds mixed Python objects, so the object dtype
# is stated explicitly rather than left to NumPy's type inference.
data = np.array(
    [
        (25, 'Male', 50000),
        (30, 'Female', 60000),
        (35, 'Male', np.nan),
        (None, 'Male', 70000),
        (40, 'Female', 80000),
    ],
    dtype=object,
)

Below is the code for cleaning the data with the help of the NumPy library.

# Categorical data cleaning
# A gender entry counts as missing when it is None or NaN. NaN never compares
# equal to anything (not even itself), so `== np.nan` can never match; the
# `value != value` test is what identifies NaN.
gender_missing = np.array(
    [value is None or value != value for value in data[:, 1]], dtype=bool
)
# The mode is the most frequent *value*: take the unique value whose count is
# largest, not the largest count itself.
gender_values, gender_counts = np.unique(
    data[~gender_missing, 1], return_counts=True
)
# With no observed values there is nothing to fill with, so leave the column.
gender_mode = gender_values[np.argmax(gender_counts)] if gender_values.size else None
if gender_mode is not None:
    data[gender_missing, 1] = gender_mode


# Continuous data cleaning
# Casting the object columns to float turns None into NaN, so np.isnan and
# the NaN-aware statistics treat both kinds of missing entry the same way.
ages = data[:, 0].astype(float)
salaries = data[:, 2].astype(float)
age_mean = np.nanmean(ages)
salary_median = np.nanmedian(salaries)
data[np.isnan(ages), 0] = age_mean
data[np.isnan(salaries), 2] = salary_median


print("Cleaned Data:")
print(data)

--

--