Data Preprocessing: Sentiment Analysis of Electrodermal Activity Data for Personalized Interventions in Mental Health

Sarthak Increase
7 min read · May 10, 2023


```python
# Import libraries
import numpy as np
import pandas as pd
import pickle
import os
from scipy import signal          # For filtering
from sklearn import preprocessing # For normalizing

# Define constants
DATA_PATH = "WESAD/"  # Path to the dataset folder
SUBJECTS = ["S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S11", "S13", "S14", "S15", "S16", "S17"]  # The 15 WESAD subjects (there is no S12)
DEVICE = "wrist"  # Key for the wrist-worn Empatica E4 signals in the WESAD pickle
MODALITIES = ["ACC", "BVP", "EDA", "TEMP"]  # Sensor modalities
SRATE = {"ACC": 32, "BVP": 64, "EDA": 4, "TEMP": 4}  # Native sampling rates (Hz)
LABELS = {1: "baseline", 2: "stress", 3: "amusement"}  # Labels for affective states
```

```python
# Define functions
def load_subject_data(subject):
    """
    Load the data of a single subject from a pickle file.
    Input: subject - string, subject ID (e.g. "S2")
    Output: data - dictionary, contains labels and sensor data
    """
    file_path = os.path.join(DATA_PATH, subject, subject + ".pkl")
    with open(file_path, "rb") as file:
        # latin1 encoding is needed because the WESAD pickles were written in Python 2
        data = pickle.load(file, encoding="latin1")
    return data
```
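
Before running the full pipeline, it helps to peek at what the loader returns. The key layout below is the one the WESAD readme documents: wrist signals under `data["signal"]["wrist"]` and the 700 Hz labels under `data["label"]`.

```python
# Quick look at the structure of one subject's pickle
data = load_subject_data("S2")
print(data.keys())                      # expected: 'signal', 'label', 'subject'
print(data["signal"][DEVICE].keys())    # expected: 'ACC', 'BVP', 'EDA', 'TEMP'
print(np.asarray(data["label"]).shape)  # the labels are sampled at 700 Hz
```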

```python
def resample(data, srate_in, srate_out):
    """
    Resample the data to a different sampling rate using linear interpolation.
    Input: data - array, original data
           srate_in - int, original sampling rate
           srate_out - int, desired sampling rate
    Output: data_resampled - array, resampled data
    """
    n_in = len(data)
    n_out = int(n_in * srate_out / srate_in)
    # Interpolate onto an evenly spaced grid spanning the original sample indices
    data_resampled = np.interp(np.linspace(0, n_in - 1, n_out), np.arange(n_in), data)
    return data_resampled
```
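
A quick way to convince yourself the helper behaves as intended is to downsample a signal whose length you know. The numbers here are illustrative and have nothing to do with the dataset.

```python
# Downsample five seconds of a 1 Hz sine from 64 Hz to 4 Hz
t = np.arange(0, 5, 1 / 64)
sine = np.sin(2 * np.pi * t)
sine_4hz = resample(sine, 64, 4)
print(len(sine), "->", len(sine_4hz))  # 320 -> 20
```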

```python
def butter_filter(data, srate, lowcut=None, highcut=None):
    """
    Filter the data using a zero-phase Butterworth filter.
    (Named butter_filter so it does not shadow Python's built-in filter.)
    Input: data - array, original data
           srate - int, sampling rate of the data
           lowcut - float or None, low cutoff frequency (Hz); None for a lowpass filter
           highcut - float or None, high cutoff frequency (Hz); None for a highpass filter
    Output: data_filtered - array, filtered data
    """
    nyq = 0.5 * srate  # Nyquist frequency

    # Check the cutoff frequencies and determine the filter type and order
    if lowcut is not None and highcut is not None:
        filter_type = 'bandpass'
        order = 4  # You can change the order as needed
        cutoff = [lowcut / nyq, highcut / nyq]  # Normalized cutoff frequencies
    elif lowcut is not None:
        filter_type = 'highpass'
        order = 2  # You can change the order as needed
        cutoff = lowcut / nyq  # Normalized cutoff frequency
    elif highcut is not None:
        filter_type = 'lowpass'
        order = 2  # You can change the order as needed
        cutoff = highcut / nyq  # Normalized cutoff frequency
    else:
        raise ValueError("At least one of lowcut and highcut must be given")

    # Design the Butterworth filter using scipy.signal.butter
    b, a = signal.butter(order, cutoff, btype=filter_type)
    # Apply it forwards and backwards with scipy.signal.filtfilt (zero phase distortion)
    data_filtered = signal.filtfilt(b, a, data)

    return data_filtered
```
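
One caveat worth calling out: `signal.butter` requires normalized cutoffs strictly between 0 and 1, so every cutoff in Hz must stay below the Nyquist frequency (half the sampling rate), otherwise it raises a `ValueError`. A minimal illustration with made-up data:

```python
# A 0.5-10 Hz bandpass on a 32 Hz signal is fine (Nyquist = 16 Hz)
acc_demo = np.random.randn(32 * 60)  # one minute of fake 32 Hz data
acc_filtered = butter_filter(acc_demo, 32, lowcut=0.5, highcut=10)

# A 5 Hz lowpass on a 4 Hz signal fails (Nyquist = 2 Hz):
# butter_filter(np.random.randn(240), 4, highcut=5)  # ValueError from signal.butter
```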

```python
def normalize(data):
    """
    Normalize the data to zero mean and unit variance using sklearn.preprocessing.scale.
    Input: data - array, original data
    Output: data_normalized - array, normalized data
    """
    data_normalized = preprocessing.scale(data)
    return data_normalized
```

```python
def smooth(data, window_size):
    """
    Smooth the data using a moving average filter with the specified window size.
    Input: data - array, original data
           window_size - int, size of the moving window
    Output: data_smoothed - array, smoothed data
    """
    # Create a one-dimensional window of uniform weights that sum to one
    window = np.ones(window_size) / window_size

    # Apply the moving average with numpy.convolve; 'same' mode preserves the
    # original length (the first and last few samples are averaged over a
    # partially empty window, so they are pulled toward zero)
    data_smoothed = np.convolve(data, window, mode='same')

    return data_smoothed
```

```python
def extract_features(data):
    """
    Extract features from the raw sensor data.
    Input: data - array, raw sensor data
    Output: features - array, extracted features
    """
    # For this example, we only use the mean and standard deviation as features.
    # You can add more features such as min, max, median, skewness, kurtosis, etc.
    mean = np.mean(data)
    std = np.std(data)
    features = np.array([mean, std])
    return features
```
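
If you want to act on the comment above and grow the feature set, a drop-in alternative might look like the sketch below; `scipy.stats` provides the skewness and kurtosis estimators. Remember that the feature-name list built later in `preprocess_subject_data` has to be extended to match.

```python
from scipy import stats

def extract_features_extended(data):
    """A richer drop-in alternative: mean, std, min, max, median, skewness, kurtosis."""
    return np.array([
        np.mean(data), np.std(data),
        np.min(data), np.max(data), np.median(data),
        stats.skew(data), stats.kurtosis(data),
    ])
```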

```python
def preprocess_subject_data(data):
    """
    Preprocess the data of a single subject by filtering, resampling, normalizing,
    smoothing, segmenting and extracting features.
    Input: data - dictionary, contains labels and sensor data
    Output: X - array, feature matrix (shape: n_segments x n_features)
            y - array, label vector (shape: n_segments)
            srates - dictionary, resampled sampling rate for each modality
            segments - dictionary, segment start and end times for each modality
            features - list, feature names (mean and std for each modality)
    """
    # Resample the 700 Hz label stream to 4 Hz (the lowest sampling rate among
    # the sensors) by nearest-neighbour indexing; linear interpolation would
    # produce fractional labels that have no meaning
    raw_label = np.asarray(data["label"])
    n_out = int(len(raw_label) * 4 / 700)
    label = raw_label[np.round(np.linspace(0, len(raw_label) - 1, n_out)).astype(int)]

    srates = {}        # Resampled sampling rate for each modality
    segments = {}      # Segment (start, end) times in seconds for each modality
    features = []      # Feature names, e.g. "EDA_mean"
    per_modality = {}  # Per-segment feature rows for each modality

    # Loop through each sensor modality
    for modality in MODALITIES:

        # WESAD stores the wrist signals under data["signal"]["wrist"]
        sensor_data = np.asarray(data["signal"][DEVICE][modality], dtype=float)
        if sensor_data.ndim > 1 and sensor_data.shape[1] > 1:
            # ACC is three-axis; reduce it to the acceleration magnitude
            sensor_data = np.linalg.norm(sensor_data, axis=1)
        else:
            sensor_data = sensor_data.ravel()

        # Filter at each signal's native rate, so every cutoff stays below the
        # Nyquist frequency, and only then resample to 4 Hz. The cutoffs are
        # arbitrary values based on common sense and the literature; change
        # them as needed or use different filters for different modalities.
        if modality == "ACC":
            # Bandpass 0.5-10 Hz (native rate 32 Hz)
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], lowcut=0.5, highcut=10)
        elif modality == "BVP":
            # Bandpass 0.5-20 Hz (native rate 64 Hz)
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], lowcut=0.5, highcut=20)
        elif modality == "EDA":
            # Lowpass 1 Hz: EDA is sampled at 4 Hz, so any cutoff must stay
            # below its 2 Hz Nyquist frequency (a 5 Hz cutoff is impossible here)
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], highcut=1)
        elif modality == "TEMP":
            # Lowpass 0.1 Hz: skin temperature changes slowly
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], highcut=0.1)

        # Resample the filtered signal to 4 Hz
        sensor_data_resampled = resample(sensor_data_filtered, SRATE[modality], 4)
        srates[modality] = 4

        # Normalize the sensor data to zero mean and unit variance
        sensor_data_normalized = normalize(sensor_data_resampled)

        # Smooth with a 5-sample moving average (tune per modality as needed)
        sensor_data_smoothed = smooth(sensor_data_normalized, 5)

        # Segment into non-overlapping windows of 60 seconds (240 samples at 4 Hz)
        window_size = 240
        n_segments = len(sensor_data_smoothed) // window_size
        sensor_data_segmented = np.array_split(
            sensor_data_smoothed[:n_segments * window_size], n_segments)

        # Store the segment start and end times in seconds
        segments[modality] = [(i * window_size / srates[modality],
                               (i + 1) * window_size / srates[modality])
                              for i in range(n_segments)]

        # Extract one feature row per segment for this modality
        per_modality[modality] = [extract_features(seg) for seg in sensor_data_segmented]
        features.extend([modality + "_mean", modality + "_std"])

    # All modalities now run at 4 Hz, so their segment counts can differ by at
    # most one trailing window; align everything on the minimum
    n_segments = min(len(rows) for rows in per_modality.values())

    # Concatenate the per-modality features into one row per segment, so the
    # columns line up with the feature names collected above
    X = np.array([np.concatenate([per_modality[m][i] for m in MODALITIES])
                  for i in range(n_segments)])

    # Segment the label data accordingly and take the most frequent label
    # in each window as the segment label
    label_segmented = np.array_split(label[:n_segments * 240], n_segments)
    y = np.array([np.bincount(seg.astype(int)).argmax() for seg in label_segmented])

    return X, y, srates, segments, features
```
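
Before looping over everyone, it is worth confirming the shapes on one subject. The counts below are illustrative; they depend on each recording's length.

```python
# Sanity check on a single subject
data = load_subject_data("S2")
X, y, srates, segments, features = preprocess_subject_data(data)
print(X.shape, y.shape)  # e.g. (n_segments, 8) and (n_segments,)
print(features)          # ['ACC_mean', 'ACC_std', 'BVP_mean', ..., 'TEMP_std']
```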

```python
# Preprocess the data of all subjects and save them as csv files
for subject in SUBJECTS:
    print("Preprocessing data for subject", subject)
    data = load_subject_data(subject)
    X, y, srates, segments, features = preprocess_subject_data(data)
    df = pd.DataFrame(X, columns=features)
    df["label"] = y
    df["subject"] = subject
    df.to_csv(os.path.join(DATA_PATH, subject + "_preprocessed.csv"), index=False)
print("Done!")
```